# TickPathogen_02_DataCleanUp.R
### ABOUT
## Script to explore NEON Tick Pathogen data
## 30 Jan 2020
## WEM
library(dplyr)
library(tidyr)
library(lubridate)
library(ggplot2)
#### read in data #####
# load the stacked tick pathogen table and print the columns it provides
tickPathogen_raw <- read.csv(
  file = "data_raw/filesToStack10092/stackedFiles/tck_pathogen.csv"
)
names(tickPathogen_raw)

##### about the data ######
# - one row per tick x pathogen test, so a tick appears once per pathogen
#   it was tested for
# - testingID identifies the individual tick
# - every tick belongs to a subsample (shared subsampleID)

##### simplifying the data ########
# keep only the columns used below
tp <- select(
  tickPathogen_raw,
  subsampleID,
  domainID,
  siteID,
  plotID,
  nlcdClass,
  decimalLatitude,
  decimalLongitude,
  elevation,
  collectDate,
  testingID,
  individualCount,
  sampleCondition,
  testResult,
  testPathogenName
)
#### explore individual columns #####
# for info see variables file in the data directory
# columns of interest:
#   testingID        - individual tick ID
#   siteID           - where the tick was collected
#   collectDate      - when the tick was collected
#   testResult       - not tested, negative, positive
#   testPathogenName - pathogen the result refers to

# record counts per domain and per site
table(tp$domainID)
table(tp$siteID)

# number of distinct subsamples and of distinct ticks
n_distinct(tp$subsampleID)
n_distinct(tp$testingID)

# results per pathogen, and sample condition categories
table(tp$testPathogenName, tp$testResult)
table(tp$sampleCondition)
# Borrelia spp appear to have the best data; consider using the genus-level
# result rather than a specific species

#### filter low quality data/errors #####
# keep rows that name a pathogen and whose sample condition is "OK"
tp <- filter(tp, !is.na(testPathogenName), sampleCondition == "OK")
#### re-format and aggregate data #######
# pathogen results: recode to numeric (positive = 1, negative = 0); every
# other value (blank, "not tested", NA, ...) becomes NA.
# The labels are matched by text instead of overwriting factor levels by
# position: positional relabelling depends on the alphabetical order of the
# levels, and it does nothing useful when read.csv() returns character columns
# (the default since R 4.0.0), which would turn every result into NA.
tp <- tp %>%
  mutate(
    testResult = tolower(trimws(as.character(testResult))),
    testResult = case_when(
      testResult == "positive" ~ 1,
      testResult == "negative" ~ 0,
      TRUE ~ NA_real_
    )
  )

# flag the case where no label was recognised, since the filter below would
# then silently drop every row
if (nrow(tp) > 0 && all(is.na(tp$testResult))) {
  warning(
    "No 'Positive'/'Negative' values found in testResult; check the raw labels",
    call. = FALSE
  )
}

# remove not tested
tp <- tp %>% filter(!is.na(testResult))
table(tp$testResult)
# derive calendar columns from the collection date, then split subsampleID
# (on ".") into its four pieces; the first two pieces duplicate information
# held in other columns, so only Taxonomy and Lifestage are kept
tp <- tp %>%
  mutate(
    Date = as.Date(collectDate),
    Year = year(Date),
    Month = month(Date)
  ) %>%
  separate(
    col = subsampleID,
    into = c("throw", "throw2", "Taxonomy", "Lifestage"),
    sep = "\\.",
    remove = FALSE
  ) %>%
  select(-throw, -throw2)
# reshape to wide format: one row per tick, one column per pathogen
# (tidyr's pivot_wider() is the replacement for spread())

# per-tick covariates: first value of every non-pathogen column
tp_wide_cols <- tp %>%
  group_by(testingID) %>%
  summarise_at(
    vars(subsampleID:sampleCondition, Date:Month),
    first
  )

# spread the test results into one column per pathogen, attach the covariates
# by testingID, and drop the `HardTick DNA Quality` column
tp_wide <- tp %>%
  pivot_wider(
    id_cols = testingID,
    names_from = testPathogenName,
    values_from = testResult
  ) %>%
  left_join(tp_wide_cols, by = "testingID") %>%
  select(-`HardTick DNA Quality`)

# the covariate table is no longer needed once joined
rm(tp_wide_cols)
# aggregate by population: one row per site x plot x collection date x taxon
#
# Borrelia.Prev is the share of positive ticks among those with a
# `Borrelia sp.` result. Ticks without a result (NA) are left out of both the
# numerator and the denominator, so one untested tick no longer turns the whole
# group into NA; a group with no tested ticks gets NA.
#
# subsampleID / subsampleID2 are the first and last subsampleID in the group.
# summarise() evaluates its arguments in order and later arguments see earlier
# results, so the last value must be taken BEFORE subsampleID is overwritten
# with its first value (otherwise both columns are identical). The select()
# afterwards restores the original column order.
tp_agg <- tp_wide %>%
  group_by(siteID, plotID, Date, Taxonomy) %>%
  summarise(
    Borrelia.Prev = if (all(is.na(`Borrelia sp.`))) {
      NA_real_
    } else {
      mean(`Borrelia sp.`, na.rm = TRUE)
    },
    nlcdClass = first(nlcdClass),
    elevation = first(elevation),
    Year = first(Year),
    subsampleID2 = last(subsampleID),
    subsampleID = first(subsampleID),
    Month = first(Month)
  ) %>%
  ungroup() %>%
  select(
    siteID, plotID, Date, Taxonomy,
    Borrelia.Prev, nlcdClass, elevation, Year,
    subsampleID, subsampleID2, Month
  )
#### visualize data ######
# prevalence through time, one coloured line per site
ggplot(tp_agg, aes(x = Date, y = Borrelia.Prev, group = siteID)) +
  geom_line(aes(colour = siteID))

# prevalence by year, points jittered and coloured by site
ggplot(tp_agg, aes(x = Year, y = Borrelia.Prev)) +
  geom_jitter(aes(group = siteID, colour = siteID), alpha = 0.7)

# prevalence by month, points jittered
ggplot(tp_agg, aes(x = Month, y = Borrelia.Prev)) +
  geom_jitter(aes(group = siteID), alpha = 0.2)

# distribution of prevalence per land-cover class
ggplot(tp_agg, aes(x = nlcdClass, y = Borrelia.Prev)) +
  geom_violin()

# distribution of prevalence per site
ggplot(tp_agg, aes(x = siteID, y = Borrelia.Prev)) +
  geom_violin()
#### write derived data ######
# write.csv() does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there)
dir.create("data_derived", showWarnings = FALSE, recursive = TRUE)

# aggregated prevalence table, converted to a plain data.frame first
tp_agg <- data.frame(tp_agg)
write.csv(
  tp_agg,
  "data_derived/Tick_Borrelia_Prev_Aggregated.csv",
  row.names = FALSE
)

# per-tick wide table (one row per tick, one column per pathogen)
write.csv(
  tp_wide,
  "data_derived/Tick_Pathogen_Individual.csv",
  row.names = FALSE
)