-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcarabids_03_EDA_carabid-species_explo.R
262 lines (226 loc) · 9.97 KB
/
carabids_03_EDA_carabid-species_explo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# This script explores the differences in species ED between the parataxonomist, expert taxonomist, and barcode datasets on NEON ground beetles, then visualizes the data
# Para vs expert comparison written by Max
library(dplyr)
library(stringr) #word()
library(ggplot2)
library(forcats) #fct_reorder
library(arm) #display
# Load carabid data
load(file="data_derived/carabids_NIWO.Rdata")
list2env(carabid_abund, .GlobalEnv)
list2env(carabid_barcode, .GlobalEnv)
### First compare barcode and expert taxonomist
barcode <- bet_BOLDtaxonomy %>%
mutate(individualID = sampleID, bc_sciname = NA) %>%
as_tibble
expert <- bet_expertTaxonomistIDProcessed %>%
mutate(expert_sciname = paste(genus, specificEpithet)) %>%
as_tibble
# Some barcode species are subspecies. Remove subspecies, leaving genus/species
for (i in 1:nrow(barcode)) {
if (sapply(strsplit(barcode$species[i]," "),length) == 2) {
barcode$bc_sciname[i] <- barcode$species[i]
} else {
barcode$bc_sciname[i] <- paste(word(barcode$species[i], 1:2),collapse=" ")
}
}
# AIS couldn't get this to work in simple ifelse format: barcode %>% mutate(bc_sciname = ifelse( sapply(strsplit(species," "),length) == 2, species, paste(word(species, 1:2),collapse=" ")))
# Join barcode and expert data
bc_exp_taxon_df <- expert %>%
dplyr::select(individualID, expert_sciname) %>%
left_join(distinct(barcode, individualID, bc_sciname))
# What percentage of barcode IDs match expert IDs
bc_exp_taxon_df %>%
filter(!is.na(bc_sciname)) %>% #samples that expert IDed and where para identified to species
mutate(same_species = expert_sciname == bc_sciname) %>%
group_by(expert_sciname) %>%
summarize(pct_correct = mean(same_species, na.rm = TRUE),
tot_correct = sum(same_species, na.rm = TRUE),
n = n()) %>%
arrange(-n) %>%
data.frame
#100%!
### Now compare barcode/expert to parataxonomist
para <- bet_parataxonomistID %>%
mutate(para_sciname = scientificName) %>%
dplyr::select(individualID, morphospeciesID, taxonRank, para_sciname) %>%
as_tibble
# Join para and expert/barcode data, and evaluate discrepancies in species ids
taxon_df <- left_join(para, bc_exp_taxon_df)
# para_sciname and expert_sciname don't always match
taxon_df %>%
filter(taxonRank == "species") %>%
summarize(mean(para_sciname == expert_sciname, na.rm = TRUE))
#0.97 match between specimen identified by expert taxonomist with parataxonomist ID
# Are any species perfectly identified?
taxon_df %>%
filter(taxonRank == "species", !is.na(expert_sciname)) %>% #samples that expert IDed and where para identified to species
mutate(same_species = para_sciname == expert_sciname) %>%
group_by(para_sciname) %>%
summarize(pct_correct = mean(same_species, na.rm = TRUE),
tot_correct = sum(same_species, na.rm = TRUE),
n = n()) %>%
arrange(-n) %>%
data.frame
# In all but one case (Cymindis cribricollis), the para and expert IDs were either 0% or 100% matching
# look at some of the discrepancies
taxon_df %>%
filter(taxonRank == "species",
para_sciname != expert_sciname) %>%
select(individualID, para_sciname, expert_sciname) %>%
arrange(para_sciname) %>%
data.frame
# Three mismatches checked with taxize pckg had different taxon serial numbers
# Plot discrepancies for samples identified to the species level
taxon_df %>%
filter(taxonRank == "species", !is.na(expert_sciname)) %>%
count(para_sciname, expert_sciname) %>%
arrange(n) %>%
mutate(discrepancy = para_sciname != expert_sciname) %>%
ggplot(aes(x = para_sciname,
y = expert_sciname,
color = discrepancy)) +
geom_point(aes(size = n)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
xlab("Species ID from parataxonomist") +
ylab("Species ID from expert taxonomist") +
scale_color_manual(values = c("black", "red"))
# Plot morphospecies IDs with expert IDs:
taxon_df %>%
filter(!is.na(morphospeciesID), !is.na(expert_sciname)) %>% #samples where morphospeciesID and expert ID exists,
count(morphospeciesID, expert_sciname) %>%
ggplot(aes(x = fct_reorder(morphospeciesID, n),
y = fct_reorder(expert_sciname, n))) +
geom_point(aes(size = n)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("Morphospecies from parataxonomist") +
ylab("Species ID from expert taxonomist")
# all but one morphospecies map to one expert species ID.
# the D13.2016.MorphBU morphospecies is a mixture of two species
# How many morphospecies occur with each parataxonomist ID?
# note: if the parataxonomist IDed to species, then no morphospecies was assigned
taxon_df %>%
group_by(para_sciname) %>%
summarize(morpho_n = n_distinct(morphospeciesID, na.rm = TRUE),
n = n()) %>%
arrange(-n) %>%
data.frame
# 3 para ID's had more than one morphospecies
# Harpalus sp. (para ID) had 6 morphospecies and Amara sp. (para ID) had 9
# Doesn't seem like using the morphospeciesID is helpful
# Plot discrepancies for all samples, excluding para ID == expert ID
taxon_df %>%
count(para_sciname, expert_sciname) %>%
arrange(n) %>%
mutate(discrepancy = para_sciname != expert_sciname) %>%
filter(discrepancy == TRUE) %>%
ggplot(aes(x = para_sciname,
y = expert_sciname)) +
geom_point(aes(size = n)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
xlab("Species ID from parataxonomist") +
ylab("Species ID from expert taxonomist") +
scale_color_manual(values = c("red"))
# Visualize those species with 100% match between para and expert IDs and with n>20
select_spp <- taxon_df %>%
mutate(same_species = para_sciname == expert_sciname) %>%
group_by(para_sciname) %>%
summarize(pct_correct = mean(same_species, na.rm = TRUE),
tot_correct = sum(same_species, na.rm = TRUE), #number of individuals for this species where para = expert where expert exists
n = n()) %>% #number of individuals with para_sciname
filter(pct_correct == 1, n > 20) %>%
pull(para_sciname)
# Plot abundance of selected Niwot species over time
taxon_df %>%
full_join(bet_parataxonomistID %>%
dplyr::select(individualID, plotID , trapID , collectDate)) %>%
filter(para_sciname %in% select_spp) %>%
ggplot() +
geom_bar(aes(x = collectDate, fill = para_sciname)) +
facet_wrap(. ~ para_sciname)
# Plot taxa by year (thanks SN)
taxon_df %>%
full_join(bet_parataxonomistID %>%
dplyr::select(individualID, plotID , trapID , collectDate)) %>%
filter(para_sciname %in% select_spp) %>%
mutate(year = lubridate::year(collectDate),
month = lubridate::month(collectDate),
day = lubridate::day(collectDate)) %>%
group_by(para_sciname, year) %>%
summarize(n=n()) %>%
ggplot() +
geom_line(aes(x = year, y = n, colour = para_sciname, group = para_sciname)) +
geom_point(aes(x = year, y = n, colour = para_sciname)) +
xlab("Collection Year")
# Pterostichus restrictus came out of nowhere!
# Plot taxa by month through the years
taxon_df %>%
full_join(bet_parataxonomistID %>%
dplyr::select(individualID, plotID , trapID , collectDate)) %>%
filter(para_sciname %in% select_spp) %>%
mutate(year = lubridate::year(collectDate),
month = lubridate::month(collectDate),
day = lubridate::day(collectDate)) %>%
group_by(para_sciname, year, month) %>%
summarize(n=n()) %>%
ggplot() +
geom_line(aes(x = month, y = n, colour = para_sciname, group = para_sciname)) +
geom_point(aes(x = month, y = n, colour = para_sciname)) +
xlab("Collection Month") +
facet_grid(year ~ .)
# July has the most abundance, but may have the most collection dates, too
# Plot taxa through time by collection day
taxon_df %>%
full_join(bet_parataxonomistID %>%
dplyr::select(individualID, plotID , trapID , collectDate)) %>%
filter(para_sciname %in% select_spp) %>%
mutate(year = lubridate::year(collectDate),
month = lubridate::month(collectDate),
day = lubridate::day(collectDate)) %>%
group_by(para_sciname, year, month, day) %>%
summarize(n=n()) %>%
ggplot() +
#geom_line(aes(x = day, y = n, colour = para_sciname, group = para_sciname)) +
geom_point(aes(x = day, y = n, colour = para_sciname)) +
xlab("Collection Day") +
facet_grid(year ~ month)
# Plot spatial arrangement of plots, labeled with habitat
# add lat and long poitns
bet_fielddata %>%
dplyr::select(plotID, nlcdClass, decimalLatitude, decimalLongitude) %>%
ggplot() +
geom_point(aes(x = decimalLongitude, y = decimalLatitude, colour = nlcdClass))
# Visualize what species are in which nlcdClass
taxon_df %>%
full_join(bet_parataxonomistID %>%
dplyr::select(individualID, plotID , trapID , collectDate)) %>%
left_join(bet_fielddata %>%
dplyr::select(plotID , nlcdClass)) %>%
filter(para_sciname %in% select_spp) %>%
ggplot() +
geom_bar(aes(x = collectDate, fill = para_sciname)) +
facet_wrap(para_sciname ~ nlcdClass)
# cool variables to look at: bet_fielddata$nativestatuscode
bet_parataxonomistID %>%
filter(scientificName %in% select_spp) %>%
select(scientificName, nativeStatusCode) %>%
summarize(mean(nativeStatusCode == "N") )
# All species are classified as native
# Exploration with compiled df's (carabids_03_zcompiled_df) --------------
clean_dat <- read.csv("data_derived/model_df_by_species_in_sample.csv")
# Plot taxa's abundance at each trap through time
quartz()
clean_dat %>%
filter(para_sciname %in% select_spp) %>%
group_by(para_sciname, plot_trap, col_year, sp_abund) %>%
summarize(sum_abund = sum(sp_abund)) %>%
mutate(occ = as.character(ifelse(sum_abund>0,1,0))) %>%
ggplot(aes(x = col_year,
y = plot_trap,
color = occ)) +
geom_point(aes(size = sum_abund)) +
scale_size_area() + #value 0 will have zero size
scale_color_manual(values=c("grey","darkblue")) +
theme(axis.text.x = element_text(size=6)) +
facet_grid(. ~ para_sciname)
#ggsave("output/species_abund_by_year_by_trap.png")