-
Notifications
You must be signed in to change notification settings - Fork 1
/
6-CalculatePollsterRatings.Rmd
143 lines (122 loc) · 5.04 KB
/
6-CalculatePollsterRatings.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
---
title: "Pindograma Pollster Rating Calculator"
author: "Daniel T. Ferreira"
output: html_document
---
<!--
This file is (c) 2020 CincoNoveSeis Jornalismo Ltda.
It is licensed under the GNU General Public License, version 3.
-->
First, we import some libraries:
```{r}
library(tidyverse)
library(fastDummies)
```
Let's start by defining some helpers:
```{r}
# Population variance: the mean squared deviation from the mean (divides by
# n rather than n - 1).
#
# x: numeric vector.
# Returns a single number. A one-element vector yields 0; the earlier
# var()-based formula returned NA there because var() is undefined for n = 1.
# NA values in x propagate to the result.
pop.var <- function(x) mean((x - mean(x))^2)
# Population standard deviation: the square root of pop.var(x).
#
# x: numeric vector.
# Returns a single number.
pop.sd <- function(x) {
  variance <- pop.var(x)
  sqrt(variance)
}
# Relative plus-minus of one poll: its error minus the average error of the
# polls published by every *other* company.
#
# cur_cnpj: identifier of the company that ran the current poll.
# cur_mm3:  error value of the current poll.
# mm3:      error values of all polls being compared.
# cnpj:     company identifiers, parallel to `mm3`.
# Returns a single number; NaN when no poll belongs to another company.
rpm_mean <- function(cur_cnpj, cur_mm3, mm3, cnpj) {
  from_other_company <- cnpj != cur_cnpj
  peers_average <- mean(mm3[from_other_company])
  cur_mm3 - peers_average
}
```
Now, let's import some useful data:
```{r}
# Manually maintained lookup tables.
reg_conre <- read_csv('data/manual-data/reg_conre_atualizado.csv')
# Strip '.', '/' and '-' from the CNPJ so only the bare characters remain;
# this column is later used as a join key against `company_id`.
reg_conre <- mutate(reg_conre, cnpj = str_replace_all(cnpj, '[\\.\\/\\-]', ''))

reg_abep <- read_csv('data/manual-data/reg_abep.csv')
company_names <- read_csv('data/manual-data/nomes_empresas.csv')
company_corr <- read_csv('data/manual-data/empresas_multiplos_cnpjs.csv')

# Poll table written to `output/` (presumably by an earlier step of the
# pipeline -- confirm).
late_polls <- read_csv('output/late_polls.csv')
```
```{r}
# Collapse the rows of `late_polls` into one row per poll scenario, attach
# error metrics, and keep only the polls that feed the rating model.
model_polls <- late_polls %>%
  # Drop zero-percent rows and express both percentages as proportions.
  filter(pct != 0) %>%
  mutate(
    p_pct = pct / 100,
    p_valid_result = valid_result / 100
  ) %>%
  # Every column that identifies or describes a poll scenario is a grouping
  # key, so all of them survive the summarise() below.
  group_by(
    year, turno,
    NR_IDENTIFICACAO_PESQUISA, NR_CNPJ_EMPRESA, company_id, pretty_name,
    SG_UE, polled_UE, CD_CARGO, estimulada, vv, scenario_id, source,
    DT_FIM_PESQUISA, self_hired, is_phone, is_fluxo, QT_ENTREVISTADOS,
    non_partisan, first_round_date, second_round_date, is_complete, state
  ) %>%
  # Order rows inside each group so that first() below picks the row with the
  # highest `valid_result`.
  arrange(desc(valid_result), .by_group = TRUE) %>%
  summarise(
    # Mean absolute gap between `valid_result` and `pct`.
    mm3 = mean(abs(valid_result - pct)),
    # Population standard deviation of the signed gap.
    stdev = pop.sd(valid_result - pct),
    # Absolute log odds ratio between poll and result, weighted by
    # `valid_result`.
    arzev_bw = weighted.mean(
      abs(log((p_pct / (1 - p_pct)) * ((1 - p_valid_result) / p_valid_result))),
      valid_result
    ),
    undecided = first(undecided)
  ) %>%
  ungroup() %>%
  # Numeric code for the kind of race; rows matching none of the rules get NA
  # and are dropped right after.
  mutate(election_type = case_when(
    polled_UE == 'BR' ~ 1,
    CD_CARGO == 1 & polled_UE != 'BR' ~ 2,
    CD_CARGO == 3 & nchar(polled_UE) == 2 ~ 3,
    CD_CARGO == 11 ~ 4,
    TRUE ~ NA_real_
  )) %>%
  filter(!is.na(election_type)) %>%
  # Absolute number of days between the end of the poll and the date of the
  # round it refers to.
  mutate(
    days_apart = as.numeric(abs(difftime(
      DT_FIM_PESQUISA,
      if_else(turno == 1, first_round_date, second_round_date),
      units = 'days'
    ))),
    one_week_prior = days_apart <= 7
  ) %>%
  # Keep polls within 21 days of the date for turno 1, 14 days for turno 2.
  filter((turno == 1 & days_apart <= 21) | (turno == 2 & days_apart <= 14)) %>%
  # Keep only polls that have exactly one scenario left at this point.
  group_by(
    year, turno, NR_IDENTIFICACAO_PESQUISA, NR_CNPJ_EMPRESA,
    SG_UE, polled_UE, CD_CARGO, estimulada, vv
  ) %>%
  filter(n_distinct(scenario_id) == 1) %>%
  ungroup() %>%
  filter(is_complete) %>%
  # Indicator columns (election_type_*, year_*) for the regression.
  dummy_cols(select_columns = 'election_type') %>%
  dummy_cols(select_columns = 'year') %>%
  mutate(
    first_round = turno == 1,
    # Inverse square root of QT_ENTREVISTADOS, capped at 6000.
    n_adj = 1 / sqrt(pmin(QT_ENTREVISTADOS, 6000))
  ) %>%
  # Indicator columns (state_*) for the regression.
  dummy_cols(select_columns = 'state')
# Linear model of poll error (`mm3`) on poll characteristics. The formula is
# assembled from term names, in this order: the numeric covariates, two
# election-type indicators, one indicator per listed state, and `first_round`.
# The residual of this model is used afterwards as the "simple plus-minus".
fit <- lm(
  reformulate(
    c(
      "undecided", "days_apart", "n_adj",
      "election_type_3", "election_type_2",
      paste0("state_", c(
        "AC", "AL", "BA", "CE", "ES", "DF", "AP",
        "GO", "MA", "MG", "MS", "MT", "PA", "PB",
        "PE", "PI", "PR", "RJ", "RN", "RO", "RR",
        "RS", "SC", "SE", "SP", "TO"
      )),
      "first_round"
    ),
    response = "mm3"
  ),
  data = model_polls
)
# Attach three plus-minus scores to every poll:
#   simple_plus_minus   - error minus the error predicted by `fit`;
#   relative_plus_minus - error minus the mean error of the other companies
#                         polling the same race in the same time window
#                         (0 when there is no other company);
#   weighted_pm         - blend of the two, weighting the relative score by
#                         the number of comparison companies and the simple
#                         score by 3.
model_polls_2 <- model_polls %>%
  mutate(simple_plus_minus = mm3 - predict(fit, newdata = .)) %>%
  group_by(year, turno, CD_CARGO, polled_UE, one_week_prior) %>%
  # Number of *other* companies in the same comparison group.
  mutate(n_empresas_cmp = n_distinct(company_id) - 1) %>%
  mutate(relative_plus_minus = ifelse(
    n_empresas_cmp != 0,
    map2_dbl(company_id, mm3, rpm_mean, mm3, company_id),
    0
  )) %>%
  ungroup() %>%
  # Decide on the number of comparison companies, not on
  # `relative_plus_minus == 0`: a poll whose error exactly equals its peers'
  # average has a genuine relative score of 0 and must still be blended.
  mutate(weighted_pm = ifelse(
    n_empresas_cmp == 0,
    simple_plus_minus,
    ((n_empresas_cmp * relative_plus_minus) + (3 * simple_plus_minus)) / (n_empresas_cmp + 3)
  )) %>%
  filter(!is.na(weighted_pm)) # NOTE: This is due to the late Macapá 2020 election
# Per-company averages of the poll scores, flagged with whether the company
# appears (with a non-missing value) in the CONRE and ABEP tables.
simple_rating_0 <- model_polls_2 %>%
  # Temporary row id: lets us undo any row duplication caused by a company
  # matching more than one row of a lookup table.
  mutate(joinid = row_number()) %>%
  left_join(reg_conre, by = c('company_id' = 'cnpj')) %>%
  distinct(joinid, .keep_all = TRUE) %>%
  mutate(has_conre = !is.na(conre)) %>%
  left_join(reg_abep, by = c('company_id' = 'cnpj')) %>%
  # Same guard as after the CONRE join: without it a repeated CNPJ in
  # `reg_abep` would count the same poll more than once in the averages.
  distinct(joinid, .keep_all = TRUE) %>%
  mutate(has_abep = !is.na(abep)) %>%
  select(-joinid, -conre, -abep) %>%
  group_by(pretty_name, company_id, has_abep, has_conre) %>%
  summarize(
    avg = mean(mm3),
    spm = mean(simple_plus_minus),
    weighted_pm = mean(weighted_pm),
    n = n()
  ) %>%
  ungroup()
# Mean weighted plus-minus over companies flagged as CONRE or ABEP.
conre_abep_mean <- with(
  simple_rating_0,
  mean(weighted_pm[which(has_conre | has_abep)])
)
# Mean weighted plus-minus over companies flagged as neither CONRE nor ABEP.
non_conre_abep_mean <- with(
  simple_rating_0,
  mean(weighted_pm[which(!has_conre & !has_abep)])
)
# Final rating table: shrink each company's weighted plus-minus towards the
# mean of its group (CONRE/ABEP vs. neither), then turn the result into a
# letter grade.
simple_rating <- simple_rating_0 %>%
  mutate(rev_mean = ifelse(has_abep | has_conre, conre_abep_mean, non_conre_abep_mean)) %>%
  # Shrinkage factor n / (n + 15): few polls pull the score towards rev_mean.
  mutate(pred_pm = rev_mean + (weighted_pm - rev_mean) * (n / (n + 15))) %>%
  # Round every double column to 3 decimals (this happens before binning).
  mutate_if(is.double, round, 3) %>%
  filter(n >= 5) %>%
  # Right-closed bins. The outer edges are open-ended so that a score beyond
  # the previously hard-coded limits (-1 and 3) still receives the best or
  # worst grade instead of NA.
  mutate(quant_bin = .bincode(pred_pm, c(-Inf, -0.57, 0, 0.5, 1.05, 1.54, Inf))) %>%
  # Bin 1 is the lowest pred_pm and maps to 'A'; bin 6 maps to 'D'.
  mutate(grade = c('A', 'B+', 'B', 'B-', 'C', 'D')[quant_bin])
# Write the rating table to disk without the row-name column.
write.csv(
  simple_rating,
  'output/pollster_rating_2020_final.csv',
  row.names = FALSE
)
```