-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTargetPop.R
178 lines (123 loc) · 5.34 KB
/
TargetPop.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# According to SAP section 3.3.2, we will match on age, sex, and poverty.
# We will also exclude homeless patients.
# This script identifies the patients we need to exclude from the target population.
colnames(panel.18)
# Target population: patients identified as Marshallese (by race, by language,
# or by KOH participation) plus non-Hispanic White patients.
# `%in%` is used instead of `==` because `==` yields NA when a field is
# missing. The set of rows kept by filter() is unchanged (filter() already
# drops NA conditions), but the indicator below is no longer NA.
targetpop <- panel.18 %>%
  filter(
    Race %in% "Marshallese" |
      Language %in% "Marshallese" |
      KOHParticipant %in% 1 |
      (Race %in% "White" & Ethnicity %in% "Not Hispanic or Latino")
  )
# Make the Marshallese indicator variable (1 = Marshallese, 0 = otherwise) and
# a readable group label. With `==`, a non-Hispanic White patient whose
# Language or KOHParticipant was missing received Marsh = NA and Group = NA and
# then silently dropped out of every later `Marsh == 0` / `Marsh == 1` filter
# and table.
targetpop <- targetpop %>%
  mutate(
    Marsh = ifelse(
      Race %in% "Marshallese" |
        Language %in% "Marshallese" |
        KOHParticipant %in% 1,
      1,
      0
    ),
    Group = ifelse(Marsh == 1, "Marshallese", "Non-Marshallese")
  )
# m <- targetpop %>% filter(Marsh == 1 ) #%>% summarize(max(BLACERISK))
# w <- targetpop %>% filter(Marsh == 0 )
############### OLDER ######################
# Patients older than the oldest Marshallese patient.
# `Marshallese` is not created in this file; presumably it comes from an
# earlier script -- confirm it is loaded before running.
# na.rm = TRUE matches the risk-score and income filters below: without it a
# single missing Marshallese age makes max() NA and this filter returns no
# rows, silently skipping the age exclusion.
older <- targetpop %>%
  filter(age > max(Marshallese$age, na.rm = TRUE))
# 67 patients
############### HOMELESS ######################
# `No.column.name` holds housing status (it was filtered from panel.18 in
# DataCleaningDID.R). Patients recorded as living in a shelter or on the
# street are collected here for exclusion.
homeless <- filter(
  targetpop,
  No.column.name %in% c("Homeless Shelter", "Street")
)
#colnames(targetpop)
#with(targetpop, table(No.column.name, Marsh))
############### DECEASED ######################
# ! Also what do we do with Deceased Date?
# Patients with a non-empty deceased date.
deceased <- targetpop %>% filter(DeceasedDate != "")
nrow(deceased)
# 344
class(deceased$DeceasedDate)
# "character"
# Parse the month/day/year strings so the dates can be ordered and plotted.
deceased$DeceasedDate <- as.Date(deceased$DeceasedDate, format = "%m/%d/%Y")
class(deceased$DeceasedDate)
min(deceased$DeceasedDate)
# "2022-06-03"
# this is before the end of the DID analysis so I would remove it...
max(deceased$DeceasedDate)
# "2024-12-25"
hist(deceased$DeceasedDate, breaks = 24)
# useNA = "ifany" keeps patients with a missing Marsh indicator in the count;
# table() drops NA by default, which hides them.
with(deceased, table(Marsh, useNA = "ifany"))
# 20 marshallese and 316 NHW
# NOTE(review): 20 + 316 = 336, not the 344 rows counted above -- the other 8
# presumably had Marsh = NA and were dropped by table(); confirm.
# A lot of them are after our DID model but I am still inclined to remove them all
############### RISK SCORES ######################
# I am surprised to see such high Risk Scores in our controls
# I don't think they make good controls if they are double the maximum Marshallese risk score.
# we will keep all people without risk scores in both groups
# Patients whose baseline risk score exceeds the largest observed Marshallese
# score. Rows with a missing BLACERISK never satisfy the comparison, so they
# are not flagged here.
higher_risk <- filter(
  targetpop,
  BLACERISK > max(Marshallese$BLACERISK, na.rm = TRUE)
)
nrow(higher_risk)
# 59
############### INCOME ######################
# higher income
colnames(targetpop)
# Patients whose income level is above the highest Marshallese income level.
higher_income <- filter(
  targetpop,
  IncomeLevel > max(Marshallese$IncomeLevel, na.rm = TRUE)
)
nrow(higher_income)
# 641
############### BMI ######################
# Read the two BMI extracts relative to the current working directory.
fp_bmi <- file.path(getwd(), "Raw Data/UWDataBMIs.csv")
bmi <- read.csv(fp_bmi)
fp_new_bmi <- file.path(getwd(), "Raw Data/UW BMI Missing List .csv")
UW.BMI.Missing.List. <- read.csv(fp_new_bmi)
# Measurement dates arrive as month/day/year text.
bmi$Date <- mdy(bmi$Date)
names(bmi)
# Give the supplementary list the same leading column names as `bmi` so the
# two files can be combined.
names(UW.BMI.Missing.List.)[1:2] <- c("UniqueIdentifier", "BMI")
# NOTE(review): no `by` is given, so the join keys are every column name the
# two tables share -- confirm that is the intended key set.
full_bmi <- full_join(bmi, UW.BMI.Missing.List.)
# Compare column names, then rename the first BMI column to `UniqueID`
# (presumably the patient key used in targetpop -- confirm) before merging.
names(targetpop)
names(full_bmi)
names(full_bmi)[1] <- "UniqueID"
# NOTE(review): if full_bmi holds more than one row per patient, this join
# repeats targetpop rows for that patient -- confirm one row per UniqueID.
targetpop <- left_join(targetpop, full_bmi)
# Largest BMI seen in each group.
Max_Marshallese_BMI <- filter(targetpop, Marsh == 1)
max(Max_Marshallese_BMI$BMI, na.rm = TRUE)
# 73
Max_NHW_BMI <- filter(targetpop, Marsh == 0)
max(Max_NHW_BMI$BMI, na.rm = TRUE)
# 111436.5
# ! note there are some zero BMI in each group as well but since they are in
# both I think we can accept these?
# Rows whose BMI is above the Marshallese maximum.
higher_BMI <- filter(
  targetpop,
  BMI > max(Max_Marshallese_BMI$BMI, na.rm = TRUE)
)
nrow(higher_BMI)
# 331 including BMI in the thousands which are clearly out of range
# we might have to merge this to targetpop
# Pool the patient IDs from every exclusion group built above.
exclude <- c(
  higher_BMI$UniqueID,
  higher_income$UniqueID,
  higher_risk$UniqueID,
  deceased$UniqueID,
  homeless$UniqueID,
  older$UniqueID
)
length(exclude)
# 2396
# A patient can fall into several exclusion groups; keep each ID once.
exclude <- unique(exclude)
length(exclude)
# 2187 which is 209 duplicates between the groups
# Final target population for our analysis: everyone not on the exclusion list.
targetpop_DID <- targetpop %>% filter(!UniqueID %in% exclude)
# Should print an empty vector: no excluded ID remains.
intersect(targetpop_DID$UniqueID, exclude)
# good it worked
# Sanity check at the patient level. `exclude` holds unique IDs, so it has to
# be compared with counts of unique IDs; comparing it with nrow() reports
# FALSE whenever targetpop has more than one row for the same UniqueID.
length(unique(targetpop_DID$UniqueID)) ==
  length(unique(targetpop$UniqueID)) - length(exclude)
# Number of targetpop rows that repeat an earlier UniqueID (0 means one row
# per patient). NOTE(review): a non-zero value would explain why the original
# row-count check was FALSE -- likely from the BMI join; confirm.
sum(duplicated(targetpop$UniqueID))
# Visually check the covariates now with histograms
# Histogram of age across the whole final population (both groups pooled).
hist(targetpop_DID$age)
# 2 groups
# hist(targetpop_DID$age[Marsh == 1 ], freq = FALSE, col = "grey",
# border = NA, xlab = "",
# ylab = "", yaxt = "n", breaks = 30,
# main = "Distribution of Age between Marshallese and Non-Hispanic White patients in DID Model",
# xlim = c(0, 1), ylim = c(0, 4.5))
# hist(targetpop_DID$age[Marsh == 0], freq = FALSE,
# add = TRUE,
# # breaks = 30,
# col=NA)
#' Density plot of one covariate, split by the Marshallese indicator
#'
#' Draws `geom_density()` for the chosen column, with one facet per value of
#' `Marsh`.
#'
#' @param demographic Unquoted name of the column in `data` to plot on the
#'   x axis.
#' @param data Data frame holding `demographic` and `Marsh`. Defaults to the
#'   global `targetpop_DID`, so existing one-argument calls behave as before.
#' @return A ggplot object.
#'
#' NOTE(review): this function is also called below with non-numeric columns
#' (Sex, housing status, clinic); a density curve may not be a meaningful
#' summary for those -- consider a bar chart.
visuals_DID <- function(demographic, data = targetpop_DID) {
  data %>%
    ggplot(aes(x = {{ demographic }})) +
    geom_density() +
    facet_wrap(~Marsh)
}
# Plot each covariate of interest for the final population, one facet per
# value of Marsh.
visuals_DID(age)
visuals_DID(BMI)
visuals_DID(IncomeLevel)
visuals_DID(Sex)
visuals_DID(BLACERISK)
visuals_DID(No.column.name) # housing status; the plot shows the categories that remain
visuals_DID(DeceasedDate) # totally blank: all of the remaining patients are still alive
visuals_DID(ClinicLocation)
# ! check table 1 of these new groups