forked from fivethirtyeight/guns-data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SHR_parser.R
153 lines (127 loc) · 7.14 KB
/
SHR_parser.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# This code parses data from the FBI's Supplementary Homicide Reports for FiveThirtyEight's
# "Gun Death in America" project.
# Data via the National Archive of Criminal Justice Data: https://www.icpsr.umich.edu/icpsrweb/content/NACJD/guides/ucr.html
# Questions/comments/corrections to [email protected]
# Thanks to Carl Bialik for his assistance with coding, debugging and analysis.
# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later with a less obvious message.
library(dplyr)
library(readr)
# Download data as zipped tsv files here: https://www.icpsr.umich.edu/icpsrweb/content/NACJD/guides/ucr.html
# Codebooks are included with downloads.
# The code below should work without modifications for the years 2006 to 2014. File layouts vary somewhat in earlier years,
# and code will need to be adjusted accordingly.
# This code will output a clean dataframe. Un-comment line near end of function to also save the data.
# Save tsv files to working directory as 'SHR_YEAR.tsv'
year <- 2010 # user entry
# Un-comment and run the line of code below AFTER generating the function below
# SHR_2010 <- SHR_parser(year) # or whatever you want to call your data frame
# Parse one year of Supplementary Homicide Report data into long format.
#
# Reads 'SHR_<year>.tsv' from the working directory (one row per incident)
# and returns a data frame with one row per person (victim or offender).
#
# Args:
#   year: Year used to build the input file name 'SHR_<year>.tsv'.
#
# Returns:
#   A data frame containing, for every person row:
#     - the 18 incident columns (id_code ... situation);
#     - age (numeric), sex, race, ethnicity;
#     - unique_id: row number of the incident in the raw file;
#     - vic_off ("vic" or "off") and vic_off_num (1 to 11);
#     - weapon, relat, circ, sub_circ (offender rows only, NA for victims);
#     - gun: 1 if this row's weapon is a firearm code, else 0;
#     - gun_used: 1 if any offender in the incident used a firearm, else 0;
#     - just, police, other_just: incident-level justifiable-homicide flags.
#       These are NA for incidents where no offender has a circumstance
#       code recorded.
#
# Stops with an error if the file has fewer columns than the layout this
# code expects (152).
SHR_parser <- function(year) {
  # Positions this code relies on: columns 1-18 describe the incident,
  # 19-22 the first victim, 23-30 the first offender, 33-72 victims 2-11
  # (4 columns each) and 73-152 offenders 2-11 (8 columns each).
  # Columns 31-32 are not used here.
  n_incident_cols <- 18L
  n_expected_cols <- 152L
  n_extra_people <- 10L
  victim_fields <- c("age", "sex", "race", "ethnicity")
  offender_fields <- c(victim_fields, "weapon", "relat", "circ", "sub_circ")
  # Firearm codes:
  # 11 - firearm type not stated
  # 12 - handgun
  # 13 - rifle
  # 14 - shotgun
  # 15 - other gun
  firearm_codes <- c(11, 12, 13, 14, 15)

  # Columns don't read in right if you don't pre-assign column types.
  coltypes <- paste0("cicniiiiiicciiicicc", strrep("c", 133))
  # Change this if you're using a different naming convention
  filename <- paste0("SHR_", year, ".tsv")

  # Read in file as a tsv
  raw_file <- read_tsv(filename, col_types = coltypes)
  if (ncol(raw_file) < n_expected_cols) {
    stop(
      "Expected at least ", n_expected_cols, " columns in ", filename,
      " but found ", ncol(raw_file), ". The file layout probably differs ",
      "for this year and the column positions need to be adjusted.",
      call. = FALSE
    )
  }

  # Assign names for JUST the incident section. The victim/offender sections
  # are named when each person is split off below.
  names(raw_file)[seq_len(n_incident_cols)] <- c(
    "id_code", "state_code", "ORI_code", "group", "geo_division", "year",
    "pop", "county", "MSA", "MSA_indic", "agency_name", "state_name",
    "offense_month", "last_update", "action_type", "offense_type",
    "incident_number", "situation"
  )

  # Assign unique id to each incident so we can join them later.
  # seq_len() (rather than 1:nrow) keeps this valid for an empty file.
  raw_file$unique_id <- seq_len(nrow(raw_file))
  id_col <- match("unique_id", names(raw_file))

  # Convert the raw age codes to numbers: "00"/"0" become NA, while "NB" and
  # "BB" (children less than one year old) are coded as age 0. The NA step
  # must run first so the freshly written "0" values are not wiped out.
  recode_age <- function(age) {
    age[age %in% c("00", "0")] <- NA
    age[age %in% c("NB", "BB")] <- "0"
    as.numeric(age)
  }

  # Pull one victim/offender slot out of the raw file, keeping the incident
  # columns and unique_id so the records can be joined up again later.
  extract_person <- function(first_col, fields, role, number, drop_empty) {
    person_cols <- first_col + seq_along(fields) - 1L
    # Select by position with `[` so no external variables are used inside
    # a tidyselect expression.
    person <- raw_file[c(seq_len(n_incident_cols), person_cols, id_col)]
    names(person)[n_incident_cols + seq_along(fields)] <- fields
    person$age <- recode_age(person$age)
    person <- mutate(person, vic_off = role, vic_off_num = number)
    if (drop_empty) {
      # Slots beyond the first are mostly unused; drop rows where every
      # person field is missing.
      has_data <- rowSums(!is.na(person[fields])) > 0
      person <- person[has_data, ]
    }
    person
  }

  # Build every slot first and bind once instead of growing a data frame in
  # a loop. Order: victim 1, offender 1, victims 2-11, offenders 2-11.
  # bind_rows() copes with victims lacking the offender-only columns.
  extra_slots <- seq_len(n_extra_people)
  people <- c(
    list(
      extract_person(19L, victim_fields, "vic", 1, drop_empty = FALSE),
      extract_person(23L, offender_fields, "off", 1, drop_empty = FALSE)
    ),
    lapply(extra_slots, function(i) {
      extract_person(33L + (i - 1L) * 4L, victim_fields, "vic", i + 1,
                     drop_empty = TRUE)
    }),
    lapply(extra_slots, function(i) {
      extract_person(73L + (i - 1L) * 8L, offender_fields, "off", i + 1,
                     drop_empty = TRUE)
    })
  )
  working <- bind_rows(people)

  # Identify offenders who used a gun. (No weapon is coded for victims, so
  # their `gun` value is always 0.)
  working <- mutate(working, gun = ifelse(weapon %in% firearm_codes, 1, 0))

  # Next we identify gun incidents: *incidents* in which any offender used a
  # gun. This is coded at the incident level and applied to every row of the
  # incident, so victims are flagged whether or not they were personally shot.
  gun_incidents <- working %>%
    filter(vic_off == "off") %>%
    group_by(unique_id) %>%
    summarize(gun_used = as.numeric(any(gun == 1)))
  working <- left_join(working, gun_incidents, by = "unique_id")

  # Same basic approach for "justifiable homicide": circumstance code 80 sets
  # `police` (law enforcement), code 81 sets `other_just` (civilians), and
  # `just` is set when either applies. Also coded at the incident level -- we
  # don't specify WHICH offender was justified.
  justified <- working %>%
    filter(vic_off == "off", !is.na(circ)) %>%
    group_by(unique_id) %>%
    summarize(
      police = as.numeric(any(circ == 80)),
      other_just = as.numeric(any(circ == 81))
    ) %>%
    mutate(just = as.numeric(police == 1 | other_just == 1)) %>%
    select(unique_id, just, police, other_just)
  working <- left_join(working, justified, by = "unique_id")

  # Un-comment the line below to also save the result to disk.
  # save(working, file = paste0("SHR_", year, ".RData"))

  working # output the final product
}