-
Notifications
You must be signed in to change notification settings - Fork 18
/
exercise3.R
234 lines (154 loc) · 6.92 KB
/
exercise3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# -----------------------------------------------------------------------------
# This program shows how to link the MEPS-HC Medical Conditions file
# to the Office-based medical visits file for data year 2021 to estimate:
# - Total number of people w office-based visit for cancer
# - Total number of office visits for cancer
# - Total expenditures for office visits for cancer
# - Percent of people with office visit for cancer
# - Average per-person expense for office visits for cancer
#
# Input files:
# - h229g.dta (2021 Office-based medical visits file)
# - h231.dta (2021 Conditions file)
# - h229if1.dta (2021 CLNK: Condition-Event Link file)
# - h233.dta (2021 Full-Year Consolidated file)
#
# Resources:
# - CCSR codes:
# https://github.com/HHS-AHRQ/MEPS/blob/master/Quick_Reference_Guides/meps_ccsr_conditions.csv
#
# - MEPS-HC Public Use Files:
# https://meps.ahrq.gov/mepsweb/data_stats/download_data_files.jsp
#
# - MEPS-HC online data tools:
# https://datatools.ahrq.gov/meps-hc
#
# -----------------------------------------------------------------------------
# Install/load packages and set global options --------------------------------
# Can skip this part if already installed
# install.packages("survey") # for survey analysis
# install.packages("foreign") # for loading SAS transport (.ssp) files
# install.packages("haven") # for loading Stata (.dta) files
# install.packages("dplyr") # for data manipulation
# install.packages("tidyr") # for data manipulation
# install.packages("devtools") # for loading "MEPS" package from GitHub
#
# devtools::install_github("e-mitchell/meps_r_pkg/MEPS") # easier file import
# Load libraries (run this part each time you re-start R)
library(survey)
library(foreign)
library(haven)
library(dplyr)
library(tidyr)
library(MEPS)
# Set survey option for lonely PSUs
options(survey.lonely.psu='adjust')
# Additional option for adjusting variance for lonely PSUs within a domain
# - More info at https://r-survey.r-forge.r-project.org/survey/html/surveyoptions.html
# - Not running in these exercises, so SEs will match SAS, Stata
#
# options(survey.adjust.domain.lonely = TRUE)
# Load datasets ---------------------------------------------------------------
# OB = Office-based medical visits file (record = medical visit)
# COND = Medical conditions file (record = medical condition)
# CLNK = Conditions-event link file (crosswalk between conditions and
# events, including PMED events)
# FYC = Full-year-consolidated file (record = MEPS sample person)
# Option 1 - load data files using read_MEPS from the MEPS package
ob21 = read_MEPS(year = 2021, type = "OB")
cond21 = read_MEPS(year = 2021, type = "COND")
clnk21 = read_MEPS(year = 2021, type = "CLNK")
fyc21 = read_MEPS(year = 2021, type = "FYC")
# Option 2 - load Stata data files using read_dta from the haven package
# ob21 <- read_dta("C:/MEPS/h229g.dta")
# cond21 <- read_dta("C:/MEPS/h231.dta")
# clnk21 <- read_dta("C:/MEPS/h229if1.dta")
# fyc21 <- read_dta("C:/MEPS/h233.dta")
# Keep only needed variables ------------------------------------------------
# Browse variables using MEPS-HC data tools variable explorer:
# -> http://datatools.ahrq.gov/meps-hc#varExp
ob21x = ob21 %>%
select(PANEL, DUPERSID, EVNTIDX, EVENTRN, OBDATEYR, OBDATEMM, OBXP21X)
cond21x = cond21 %>%
select(PANEL, DUPERSID, CONDIDX, ICD10CDX, CCSR1X:CCSR3X)
fyc21x = fyc21 %>%
select(PANEL, DUPERSID, AGELAST, PERWT21F, VARSTR, VARPSU)
# Prepare data for estimation -------------------------------------------------
# Subset condition records to CANCER (any CCSR = "NEO...")
# + FAC006 (Encounters for antineoplastic therapies)
# - NEO073 (Benign neoplasms)
view_conditions = cond21 %>%
count(CCSR1X, CCSR2X, CCSR3X)
View(view_conditions)
cancer <- cond21x %>%
filter(
grepl("NEO", CCSR1X) |
grepl("NEO", CCSR2X) |
grepl("NEO", CCSR3X) |
CCSR1X == "FAC006" |
CCSR2X == "FAC006" |
CCSR3X == "FAC006" ) %>%
filter(! (CCSR1X == "NEO073" |
CCSR2X == "NEO073" |
CCSR3X == "NEO073" ) )
# view ICD10-CCSR combinations for cancer
cancer %>%
count(ICD10CDX, CCSR1X, CCSR2X, CCSR3X)
# >> Note that same person can multiple cancers, and can even have multiple
# conditions with the same ICD10CDX and CCSR values
# >> Example:
cancer %>% filter(DUPERSID == '2320589102')
# Merge cancer conditions with OB event file, using CLNK as crosswalk
# >> use multiple = "all" option for many-to-many merge
cancer_merged <- cancer %>%
inner_join(clnk21, by = c("PANEL", "DUPERSID", "CONDIDX"), multiple = "all") %>%
inner_join(ob21x, by = c("PANEL", "DUPERSID", "EVNTIDX"), multiple = "all") %>%
mutate(ob_visit = 1)
# QC: check that EVENTYPE = 1 for all rows
clnk21 %>% count(EVENTYPE)
cancer_merged %>% count(EVENTYPE)
# >> Check events for example person:
cancer_merged %>% filter(DUPERSID == '2320589102')
# De-duplicate on EVNTIDX so we don't count the same event twice
# >> Example of same event (EVNTIDX) for treating multiple cancer
cancer_merged %>% filter(DUPERSID == '2326533101')
cancer_unique = cancer_merged %>%
distinct(PANEL, DUPERSID, EVNTIDX, OBXP21X, ob_visit)
# >> Check example person:
cancer_unique %>% filter(DUPERSID == '2326533101')
# Aggregate to person-level --------------------------------------------------
pers = cancer_unique %>%
group_by(DUPERSID) %>%
summarize(
pers_XP = sum(OBXP21X), # total person exp. for cancer office visits
pers_nvisits = sum(ob_visit)) # total number of cancer office visits
# Add indicator variable
pers = pers %>%
mutate(any_OB = 1)
# Merge onto FYC file ---------------------------------------------------------
# >> Need to capture all Strata (VARSTR) and PSUs (VARPSU) for all MEPS sample
# persons for correct variance estimation
fyc_cancer <- fyc21x %>%
full_join(pers, by = "DUPERSID") %>%
# replace NA with 0
replace_na(list(pers_nvisits = 0, any_OB = 0))
# QC: should have same number of rows as FYC file
nrow(fyc21x) == nrow(fyc_cancer)
# Define the survey design ----------------------------------------------------
meps_dsgn <- svydesign(
id = ~VARPSU,
strata = ~VARSTR,
weights = ~PERWT21F,
data = fyc_cancer,
nest = TRUE)
cancer_dsgn = subset(meps_dsgn, any_OB == 1)
# Calculate estimates ---------------------------------------------------------
# National Totals:
svytotal(~ any_OB + # Total people w/ office visit for cancer
pers_nvisits + # Total number of office visits for cancer
pers_XP, # Total expenditures for office visits for cancer
design = cancer_dsgn)
# Percent of people with office visit for cancer
svymean( ~any_OB, design = meps_dsgn)
# Average per-person expense for office visits for cancer
svymean( ~pers_XP, design = cancer_dsgn)