# 02-nmfs-cohort-selection.R
# (Web-page navigation text and the line-number gutter were removed here; they are not part of the script.)
source("R/header.R")
source("R/functions.R")
## Re-read the sign-up sheet now that mentors have added their annotations
ss_new <- read_sheet(
  signup_sheet_mentor_cp,
  sheet = annotated_ws,
  # Leading rows to skip; verify this against the sheet
  skip = 3
)

## Divisions with at most 5 sign-ups; a missing (NA) division is excluded
small_divs <- ss_new |>
  count(division, name = "n_signups") |>
  filter(n_signups <= 5) |>
  pull(division) |>
  na.omit()
ss_new <- ss_new |>
  # Add new derived columns. Order matters inside mutate(): later expressions
  # see the values produced by earlier ones.
  mutate(
    # Treat NA in cohort columns as "no"
    across(starts_with("cohort_"), \(x) ifelse(is.na(x), "no", x)),
    # PIFSC can't do cohort B. This is applied BEFORE `none_avail` is derived
    # so that a PIFSC applicant whose only option was cohort B is correctly
    # flagged as having no available cohort (and therefore not as priority).
    cohort_b = ifelse(is_true_v(division == "PIFSC"), "no", cohort_b),
    # Division name mentions one of the listed offices
    new_office = grepl("(OPR)|(OST)|(AKRO)|(GARFO)", division),
    # One point per flag that is_true_v() reports as true
    # NOTE(review): is_true_v() is defined outside this file; presumably an
    # NA-safe "is TRUE" test -- confirm in R/functions.R
    points = is_true_v(new_hire) + is_true_v(si) + is_true_v(supervisor) +
      is_true_v(new_office),
    # No cohort is possible for this applicant
    none_avail = cohort_a == "no" & cohort_b == "no" & cohort_c == "no",
    # Priority: (not a previous champion, not a mentor, at least one point)
    # or an explicit override, and in either case some cohort is available
    priority = (
      (!prev_champion & !is_true_v(mentor) & points > 0) |
        is_true_v(override_yes)
    ) &
      !is_true_v(none_avail),
    # Applicants from small divisions (those listed in `small_divs`) are also
    # priority as long as they have some availability
    priority = priority | (!is_true_v(none_avail) & division %in% small_divs)
  )
# Fix the RNG seed so the random tie-breaks below are reproducible
# for (i in 1:20) {
i <- 20
set.seed(i)

ss_picked <- ss_new |>
  # One row per applicant x cohort; "cohort_a" becomes cohort "A", etc.
  pivot_longer(
    cols = c(cohort_a, cohort_b, cohort_c),
    names_to = "cohort",
    names_prefix = "cohort_",
    names_transform = toupper,
    values_to = "cohort_available"
  ) |>
  # Score each stated availability: 3 = plain yes, 2 = willing, 1 = qualified
  # or unsure, 0 = anything else. Branches are tried top to bottom.
  mutate(
    cohort_weight = case_when(
      cohort_available == "yes" ~ 3,
      cohort_available == "if necessary" |
        grepl("prefer other", cohort_available, ignore.case = TRUE) |
        grepl("okay", cohort_available, ignore.case = TRUE) ~ 2,
      grepl("yes.+but", cohort_available, ignore.case = TRUE) |
        grepl("cannot make", cohort_available, ignore.case = TRUE) |
        grepl("except", cohort_available, ignore.case = TRUE) |
        grepl("rather not", cohort_available, ignore.case = TRUE) |
        cohort_available == "unsure" ~ 1,
      .default = 0
    )
  ) |>
  # Keep a persistent group_by() here: the random draws are made group by
  # group, so the grouping determines which draw each applicant receives.
  group_by(priority, email_address) |>
  # Within each applicant, pick the cohort with the highest non-zero weight.
  # When several cohorts share that weight (e.g. 2 or 3 "yes"s), the one with
  # the largest random draw wins.
  mutate(
    rand = rnorm(n()),
    pick = cohort_weight == max(cohort_weight) &
      cohort_weight > 0 &
      rand == max(rand[cohort_weight == max(cohort_weight)])
  )
# Keep only the picked cohort row per applicant and drop the working columns
# (cohort_available through pick). Rows are ordered by cohort, then priority
# applicants first, then by descending points.
cohort_assigned <- ss_picked |>
  ungroup() |>
  filter(pick) |>
  select(!(cohort_available:pick)) |>
  arrange(cohort, desc(priority), desc(points))

# Did we get everyone? Each setdiff() lists the emails found in the first
# table but not in the second.
setdiff(ss_new$email_address, cohort_assigned$email_address)
setdiff(cohort_assigned$email_address, ss_new$email_address)

# Report the seed and the priority x cohort head count
print(i)
print(count(cohort_assigned, priority, cohort))
# }
## Fix allocation of teams: no more than 18 across cohorts
## Big divisions
max_total <- 18

## Make a data.frame of the maximum number of non-priority
## applicants that can be accepted, across cohorts.
## One row per division with more than `max_total` assigned applicants.
## Divisions with no priority applicants are kept (their whole quota is
## available to non-priority applicants) rather than dropped, and the budget
## never goes below zero when priority applicants alone exceed the quota.
big_divs <- cohort_assigned |>
  summarise(
    n_total = n(),
    # NA priority is not counted as priority
    n_priority = sum(priority %in% TRUE),
    .by = division
  ) |>
  filter(n_total > max_total) |>
  mutate(max_extra_accepted = pmax(max_total - n_priority, 0)) |>
  select(division, max_extra_accepted) |>
  arrange(division)
# Initial selection without enforcing quotas: number the applicants within
# each cohort in their current row order and accept the first 40.
cohort_selection_init <- cohort_assigned |>
  group_by(cohort) |>
  mutate(
    number = row_number(),
    accepted = number <= 40
  ) |>
  ungroup()

# Accepted applicants per division. This is a bare top-level expression, so
# its value is only displayed where auto-printing applies (e.g. interactive use).
cohort_selection_init |>
  filter(accepted) |>
  count(division, name = "n_selected")
# Find those overrepresented: divisions with more than `max_total` accepted
# applicants. The result has one row per accepted, non-priority applicant in
# such a division; only these rows are candidates for removal.
over_reps <- cohort_selection_init |>
  filter(accepted) |>
  count(division, name = "n_selected") |>
  filter(n_selected > max_total) |>
  left_join(big_divs, by = "division") |>
  left_join(cohort_selection_init, by = "division") |>
  filter(accepted, !priority)

# Randomly choose which accepted non-priority applicants to remove.
# Intentionally use sample_n() even though it's superseded because it allows
# different size per group
to_remove <- if (nrow(over_reps) > 0) {
  over_reps |>
    group_by(division) |>
    mutate(
      # A division missing from `big_divs` has no recorded budget; give it
      # the whole quota.
      budget = coalesce(as.numeric(max_extra_accepted), max_total),
      # Each group holds exactly the accepted non-priority applicants, so the
      # excess is the group size minus the budget. Clamp to [0, group size] so
      # sample_n() is never asked for more rows than exist.
      n_remove = min(max(n() - budget[1], 0), n())
    ) |>
    sample_n(size = n_remove[1]) |>
    pull(email_address)
} else {
  # Nobody to remove; an empty vector matches no email at all
  character(0)
}
# Final selection: demote removed applicants, renumber within each cohort,
# re-apply the 40-per-cohort cutoff and tidy the columns for output.
# The result stays grouped by cohort.
cohort_selection <- cohort_selection_init |>
  mutate(
    # Send quota removals and any "fay lab" team to the back of the queue
    number = ifelse(
      grepl("fay lab", team_name, ignore.case = TRUE) |
        email_address %in% to_remove,
      999,
      number
    )
  ) |>
  arrange(number) |>
  group_by(cohort) |>
  mutate(
    # Fresh 1..n numbering per cohort in the new order
    number = row_number(),
    accepted = number <= 40,
    status = ifelse(accepted, "accepted", "waitlist"),
    # First keyword found in the combined, lower-cased free-text fields
    si_name = str_extract(
      tolower(paste(team_name, team_needs, briefly_describe)),
      "(fdd)|(cefi)|(\\bpam\\b)|(\\baa\\b)|(acoustics)|(omics)|(socioecon)|(\\bsi\\b)"
    )
  ) |>
  # Bring back the per-cohort availability answers from the annotated sheet
  left_join(
    select(ss_new, email_address, cohort_a, cohort_b, cohort_c),
    by = "email_address"
  ) |>
  # Drop helper columns, then move the identifying columns to the front
  select(!c(x, accepted, previously_reviewed, none_avail)) |>
  relocate(
    cohort,
    division,
    first_name,
    last_name,
    team_name,
    si_name,
    status
  ) |>
  arrange(cohort, status, division)
# Summarize number of each grouping in each cohort
# NOTE(review): pivot_summary() is not defined in this file (presumably it
# comes from R/functions.R -- confirm). None of these four summaries is used
# again in the code visible here.
accepted_summary <- pivot_summary(cohort_selection, status)
division_summary <- pivot_summary(cohort_selection, division)
si_name_summary <- pivot_summary(cohort_selection, si_name)
team_name_summary <- pivot_summary(cohort_selection, team_name)
# Add the `final_worksheet` sheet to the sign-up spreadsheet, then write the
# final selection table into it.
sheet_add(signup_sheet_mentor_cp, sheet = final_worksheet)
write_sheet(cohort_selection, ss = signup_sheet_mentor_cp, sheet = final_worksheet)