-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_2020_2.R
310 lines (273 loc) · 15.6 KB
/
parse_2020_2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# parse_2020_2.R
# (c) 2020 CincoNoveSeis Jornalismo Ltda.
# MUST VERIFY:
# - multiple scenarios
# - total and relation to vv
library(tidyverse)
library(stringi)
library(lubridate)
library(mgsub)
library(fastdigest)
library(sqldf)
options('sqldf.dll' = paste0(getwd(), '/', 'spellfix.so'))
Rcpp::sourceCpp('src/wordmatch.cpp')
source('polling_utils.R')
source('poll_to_candidate_matcher.R')
source('polls_registry.R')
cands_ = read_csv2('data/tse/consulta_cand_2020_BRASIL.csv', locale = locale(encoding = 'ISO-8859-1')) %>%
filter(CD_CARGO == 11)
cands_2 = cands_ %>%
rename(NOME_CANDIDATO = NM_CANDIDATO, NOME_URNA_CANDIDATO = NM_URNA_CANDIDATO,
CODIGO_CARGO = CD_CARGO, SIGLA_UE = SG_UE, NUMERO_CANDIDATO = NR_CANDIDATO,
NUM_TURNO = NR_TURNO) %>%
select(NOME_CANDIDATO, NOME_URNA_CANDIDATO, CODIGO_CARGO, SIGLA_UE, NUMERO_CANDIDATO, NUM_TURNO, ANO_ELEICAO) %>%
mutate(NOME_CANDIDATO = normalize_cand(NOME_CANDIDATO)) %>%
mutate(NOME_URNA_CANDIDATO = normalize_cand(NOME_URNA_CANDIDATO)) %>%
mutate(CODIGO_CARGO = ifelse(CODIGO_CARGO == 8, 7, CODIGO_CARGO)) %>%
mutate(SIGLA_UF = NA) %>%
distinct() %>%
distinct(NOME_CANDIDATO, NOME_URNA_CANDIDATO, .keep_all = T)
cands = bind_rows(cands_2, cands_2) %>%
mutate(NUM_TURNO = c(rep(1, nrow(cands_2)), rep(2, nrow(cands_2))))
rm(cands_, cands_2)
estatisticos_ids = read_csv('data/manual-data/estatisticos_ids.csv')
df = load_poll_registry_data(estatisticos_ids = estatisticos_ids, old = F)
df_for_merge_prelim = get_poll_registry_for_merge(df)
df_for_merge = df_for_merge_prelim
leva1 = read_csv('data/manual-data/manual-2020/pedro-fixed-pedro_leva1_2020_orig.csv', col_types = rtypes)
leva2 = read_csv('data/manual-data/manual-2020/pedro-fixed-pedro_leva2_2020_orig.csv', col_types = rtypes)
leva3 = read_csv('data/manual-data/manual-2020/pedro_leva3_2020.csv', col_types = rtypes)
leva3_ex = read_csv('data/manual-data/manual-2020/pedro_leva_3_extra_.csv', col_types = rtypes)
leva4 = read_csv('data/manual-data/manual-2020/pedro_leva4_2020.csv', col_types = rtypes)
leva5 = read_csv('data/manual-data/manual-2020/pedro_leva5_2020.csv', col_types = rtypes)
leva6 = read_csv('data/manual-data/manual-2020/pedro_leva6_2020.csv', col_types = rtypes)
leva7 = read_csv('data/manual-data/manual-2020/pedro_leva7_2020.csv', col_types = rtypes)
leva8 = read_csv('data/manual-data/manual-2020/pedro_leva8_2020.csv', col_types = rtypes)
leva9 = read_csv('data/manual-data/manual-2020/pedro_leva9_2020.csv', col_types = rtypes)
leva10 = read_csv('data/manual-data/manual-2020/pedro_leva10_2020.csv', col_types = rtypes)
leva11 = read_csv('data/manual-data/manual-2020/leva11_2020.csv', col_types = rtypes)
leva11_ex = read_csv('data/manual-data/manual-2020/pedro_review_zeros.csv', col_types = rtypes)
leva12_ex = read_csv('data/manual-data/manual-2020/pedro_leva12_extra.csv', col_types = rtypes)
leva12_ex2 = read_csv('data/manual-data/manual-2020/pedro_leva12_extra2.csv', col_types = rtypes)
leva13 = read_csv('data/manual-data/manual-2020/pedro_leva13.csv', col_types = rtypes)
leva14 = read_csv('data/manual-data/manual-2020/pedro_leva14_2020.csv', col_types = rtypes)
leva15 = read_csv('data/manual-data/manual-2020/pedro_leva15_2020.csv', col_types = rtypes)
leva16 = read_csv('data/manual-data/manual-2020/pedro_leva16_2020.csv', col_types = rtypes)
leva17 = read_csv('data/manual-data/manual-2020/pedro_leva17_2020.csv', col_types = rtypes)
leva18 = read_csv('data/manual-data/manual-2020/pedro_leva18_2020.csv', col_types = rtypes)
leva19 = read_csv('data/manual-data/manual-2020/pedro_leva19_2020.csv', col_types = rtypes)
leva19_ex = read_csv('data/manual-data/manual-2020/pedro_leva19_2020_extra.csv', col_types = rtypes)
leva20 = read_csv('data/manual-data/manual-2020/pedro_leva20_2020.csv', col_types = rtypes)
leva21 = read_csv('data/manual-data/manual-2020/pedro_leva21_2020_part1.csv', col_types = rtypes)
leva22 = read_csv('data/manual-data/manual-2020/pedro_leva22_2020.csv', col_types = rtypes)
leva22_bizarre = read_csv('data/manual-data/manual-2020/pedro_leva22_2020_bizarre.csv', col_types = rtypes)
leva23 = read_csv('data/manual-data/manual-2020/pedro_leva23_2020.csv', col_types = rtypes) %>%
mutate(util = ifelse(is.na(util), 0, util))
leva23_extra = read_csv('data/manual-data/manual-2020/pedro_leva23_extra_2020.csv', col_types = rtypes)
leva24 = read_csv('data/manual-data/manual-2020/pedro_leva24_2020.csv', col_types = rtypes)
leva24_extra = read_csv('data/manual-data/manual-2020/pedro_2020_patch1.csv', col_types = rtypes)
leva25 = read_csv('data/manual-data/manual-2020/pedro_leva25_2020.csv', col_types = rtypes)
leva26 = read_csv('data/manual-data/manual-2020/pedro_leva26_2020.csv', col_types = rtypes)
leva27 = read_csv('data/manual-data/manual-2020/pedro_leva27_2020.csv', col_types = rtypes)
leva28 = read_csv('data/manual-data/manual-2020/pedro_leva28_2020.csv', col_types = rtypes)
leva30 = read_csv('data/manual-data/manual-2020/pedro_leva30_2020.csv', col_types = rtypes)
leva31_ex = read_csv('data/manual-data/manual-2020/pedro_leva31_extra_2020.csv', col_types = rtypes)
leva31 = read_csv('data/manual-data/manual-2020/pedro_leva31_2020.csv', col_types = rtypes)
leva32 = read_csv('data/manual-data/manual-2020/pedro_leva32_2020.csv', col_types = rtypes)
leva33 = read_csv('data/manual-data/manual-2020/pedro_leva33_2020.csv', col_types = rtypes)
leva34 = read_csv('data/manual-data/manual-2020/pedro_leva34_2020.csv', col_types = rtypes)
leva35 = read_csv('data/manual-data/manual-2020/pedro_leva35_2020.csv', col_types = rtypes)
leva36 = read_csv('data/manual-data/manual-2020/pedro_leva36_2020.csv', col_types = rtypes)
#leva37 = read_csv('data/manual-data/manual-2020/faltantes_2020.csv', col_types = rtypes)
X2020 = bind_rows(leva1, leva2, leva3, leva3_ex, leva4, leva5, leva6, leva7, leva8, leva9,
leva10, leva11, leva11_ex, leva12_ex, leva12_ex2, leva13, leva14, leva15,
leva16, leva17, leva18, leva19, leva19_ex, leva20, leva21, leva22,
leva22_bizarre, leva23, leva23_extra, leva24, leva24_extra, leva25,
leva26, leva27, leva28, leva30, leva31_ex, leva31, leva32, leva33,
leva34, leva35, leva36)#, leva37)
X2020_2 = normalize_input(X2020)
cur_0 = X2020_2 %>%
mutate(scenario_id = row_number()) %>%
rowwise() %>%
mutate(total = sum(c_across(contains('resul')), na.rm = T)) %>%
mutate(for_patch = (total > 98 & is.na(vv)) | (total < 98 & !is.na(vv)) | total > 102) %>%
ungroup()
patch_1 = read_csv('data/manual-data/manual-2020/patch_1_2020.csv') %>%
mutate(year = 2020)
patch_2 = read_csv('data/manual-data/manual-2020/pedro_patch2_2020.csv', col_types = rtypes) %>%
select(tse_id:resul15, NM_UE.x) %>%
rename(NM_UE = NM_UE.x) %>%
normalize_input() %>%
mutate(scenario_id = row_number() + 10000)
patch_3 = read_csv('data/manual-data/manual-2020/pedro_patch3_2020.csv', col_types = rtypes) %>%
select(tse_id:resul15, NM_UE) %>%
normalize_input() %>%
mutate(scenario_id = row_number() + 20000)
cur_1 = bind_rows(
cur_0 %>% filter(!for_patch),
patch_1,
patch_3
) %>%
mutate(scenario_id = row_number())
cur = bind_rows(
cur_1 %>% filter(!(tse_id %in% patch_2$tse_id)),
patch_2
)
lhs = cur %>%
pivot_longer(cols = starts_with('cand'),
names_to = 'index',
names_prefix = 'cand',
values_to = 'candidate',
values_drop_na = T) %>%
select(-matches('resul'), -matches('part'))
rhs = cur %>%
pivot_longer(cols = starts_with('resul'),
names_to = 'index',
names_prefix = 'resul',
values_to = 'result',
values_drop_na = T) %>%
select(-matches('cand'), -matches('part'))
manual = inner_join(lhs, rhs, by = c(
'index' = 'index',
'scenario_id' = 'scenario_id'
)) %>%
select(-matches('\\.y')) %>%
rename_at(vars(matches('\\.x')), function(x) { str_sub(x, end = -3) }) %>%
mutate(tse_id = ifelse(is.na(tse_id), id_pesq, tse_id)) %>%
mutate(NM_UE = ifelse(is.na(NM_UE), info_muni, NM_UE)) %>%
mutate(tse_id = str_replace(tse_id, '\\/2019', '\\/2020'))
manual_tse = manual %>%
inner_join(df_for_merge, by = c('tse_id' = 'f_id')) %>%
mutate(main_source = 'Pindograma-Manual') %>%
rename(source = url) %>%
mutate(CD_CARGO = recode(position,
`p` = 11,
`v` = 13,
))
p3_2020 = read_csv2('data/poder360/consulta_2020.csv') %>%
mutate(reg = str_replace_all(normalize_simple(num_registro), '[\\s\\.\\-\\/]', '')) %>%
anti_join(manual_tse, c('reg' = 'NR_IDENTIFICACAO_PESQUISA')) %>%
filter(cargo == 'Prefeito') %>%
mutate(reg = ifelse(reg == 'P042832020', 'SP042832020', reg)) %>%
mutate(cmp_ue = normalize_simple(cidade)) %>% left_join(election_dates, by = c('ano' = 'year')) %>%
mutate(turno_realizacao = ifelse(data_pesquisa <= first_round_date, 1, 2)) %>%
mutate(position = 'p') %>%
mutate(CD_CARGO = recode(position,
`p` = 11,
`pr` = 1,
`g` = 3,
`s` = 5
)) %>%
mutate(estimulada = tipo_id == 2) %>%
mutate(raw_cand = normalize_cand(candidato)) %>%
mutate(candidate_without_title = normalize_cand_rm_titles(raw_cand)) %>%
filter(turno_realizacao == turno) %>%
mutate(norm_cenario_desc = normalize_simple(cenario_descricao)) %>%
filter(tipo_id != 3 & !grepl('REJEICAO', norm_cenario_desc)) %>%
filter(ambito != 'RE') %>%
filter(percentual != 0) %>%
filter(condicao == 0 & !grepl('EM BRANCO|NULO|NENHUM|BASE|TOTAL|OUTROS|OUTRAS|NAO SABE|NAO RESPOND|RECUSA|NS\\/NR|CITOU OUTRO', raw_cand)) %>%
inner_join(df_for_merge, by = c('reg' = 'NR_IDENTIFICACAO_PESQUISA')) %>%
group_by(reg, estimulada, voto_tipo) %>%
filter(n_distinct(cenario_id) == 1) %>%
ungroup() %>%
mutate(main_source = 'Poder360', source = NA) %>%
rename(NR_IDENTIFICACAO_PESQUISA = reg, year = ano, candidate = raw_cand, result = percentual) %>%
mutate(result = str_to_dbl(result)) %>%
mutate(scenario_id = 50000 + row_number())
# NOTE: This classification is out-of-date. All `company_id`s defined here will be OVERWRITTEN
# in 5-SelectedPollsExport.Rmd for pollster rating purposes. For aggregation purposes, this
# is not yet done below, but we won't be updating these 2020 poll CSVs anyway, so it doesn't
# matter.
manual_p3_merged = bind_rows(manual_tse, p3_2020) %>%
mutate(company_id = case_when(
NR_CNPJ_EMPRESA %in% c('24776969000117', '07742623000189', '67662494000140', '14931054000185', '26195312000191') ~ 'REALIDADE',
NR_CNPJ_EMPRESA %in% c('23254436000102', '00852438000106', '00852501000104') ~ 'VOX POPULI',
NR_CNPJ_EMPRESA %in% c('05939922000263', '32208779000121') ~ 'GERP',
NR_CNPJ_EMPRESA %in% c('10828442000184', '22198794000182', '33539440000170') ~ 'MULTIPLA',
NR_CNPJ_EMPRESA %in% c('17070395000100', '06154093000195') ~ 'MULTIDADOS',
NR_CNPJ_EMPRESA %in% c('09656847000101', '09283689000183') ~ 'PROMIDIA',
NR_CNPJ_EMPRESA %in% c('03622028000159', '39795414000190') ~ 'FUTURA',
NR_CNPJ_EMPRESA %in% c('26968293000199', '11785871000184') ~ 'RANKING',
NR_CNPJ_EMPRESA %in% c('57541377000175', '16623147000178') ~ 'DGABC',
NR_CNPJ_EMPRESA %in% c('01077145000153', '10575983000148') ~ 'DATAMETRICA',
NR_CNPJ_EMPRESA %in% c('13776203000116', '12784563000105') ~ 'ARBEIT',
NR_CNPJ_EMPRESA %in% c('14263830000116', '16684996000131') ~ 'AGORASEI',
NR_CNPJ_EMPRESA %in% c('11509901000120', '20450146000146') ~ 'LONDON',
NR_CNPJ_EMPRESA %in% c('17110229000181', '29880121000157') ~ 'AMPLACE',
NR_CNPJ_EMPRESA %in% c('03397255000128', '33030852000180') ~ 'TDL',
NR_CNPJ_EMPRESA %in% c('04996040000196', '09439784000123') ~ 'ANGULO',
NR_CNPJ_EMPRESA %in% c('03490620000144', '32980640000100') ~ 'TENDENCIAMS',
NR_CNPJ_EMPRESA %in% c('36607622000120', '11535761000164') ~ 'SUDOESTE',
NR_CNPJ_EMPRESA %in% c('02291216000189', '01338700000153') ~ 'GAUSS',
NR_CNPJ_EMPRESA %in% c('22913911000142', '04216356000118') ~ 'ALVO',
NR_CNPJ_EMPRESA %in% c('05281052000105', '28158617000159', '00961694000123') ~ 'VOGA',
NR_CNPJ_EMPRESA %in% c('09415165000107', '73988917000110') ~ 'NILTON',
T ~ NR_CNPJ_EMPRESA
)) %>%
mutate(turno = ifelse(DT_FIM_PESQUISA <= make_date(2020, 11, 15), 1, 2)) %>%
mutate(candidate_without_title = normalize_cand_rm_titles(candidate)) %>%
mutate(candidate = ifelse(SG_UE == '90670' & grepl('ABILIO', candidate), 'ABILIO', candidate)) %>%
mutate(candidate = ifelse(SG_UE == '09210' & grepl('BIRA', candidate), 'BIRA', candidate))
corr = read_csv('data/manual-data/pedro_corr_cands_2020.csv') %>%
filter(wrong == 1)
manual_matches_ = match_polls_with_candidates(manual_p3_merged)
manual_matches = manual_matches_ %>%
anti_join(corr, by = c('candidate', 'NOME_CANDIDATO', 'NOME_URNA_CANDIDATO'))
company_names = read_csv('data/manual-data/nomes_empresas.csv')
all_polls = manual_matches %>%
distinct(year, NR_IDENTIFICACAO_PESQUISA, NR_CNPJ_EMPRESA, SG_UE, CD_CARGO, company_id, suspensa,
estimulada, NUMERO_CANDIDATO, NOME_URNA_CANDIDATO, result, DT_FIM_PESQUISA, vv, turno,
is_fluxo, is_phone, self_hired, QT_ENTREVISTADOS, main_source, source, scenario_id, is_complete,
DT_INICIO_PESQUISA, hirer, confidence_interval_final, error_final, candidate, partisan) %>%
left_join(company_names, 'company_id') %>%
mutate(polled_UE = SG_UE) %>%
group_by(NR_IDENTIFICACAO_PESQUISA, NR_CNPJ_EMPRESA, SG_UE, CD_CARGO, estimulada, scenario_id) %>%
mutate(undecided = 100 - sum(result)) %>%
ungroup()
all_polls_2 = all_polls %>%
group_by(NR_IDENTIFICACAO_PESQUISA, estimulada, CD_CARGO, scenario_id) %>%
filter(n_distinct(candidate) == n()) %>%
mutate(scenario_count = n()) %>%
ungroup() %>%
group_by(NR_IDENTIFICACAO_PESQUISA, estimulada, CD_CARGO, NR_CNPJ_EMPRESA) %>%
mutate(multiple_scenarios = n_distinct(scenario_id) > 1) %>%
filter(scenario_count == max(scenario_count)) %>%
ungroup() %>%
select(-scenario_count)
all_polls_3 = all_polls_2 %>%
group_by(NR_IDENTIFICACAO_PESQUISA, NR_CNPJ_EMPRESA, SG_UE, polled_UE, estimulada, CD_CARGO, vv, scenario_id) %>%
mutate(group_digest = fastdigest(cur_data() %>% arrange(NUMERO_CANDIDATO) %>% select(NUMERO_CANDIDATO, result))) %>%
ungroup() %>%
group_by(NR_IDENTIFICACAO_PESQUISA, NR_CNPJ_EMPRESA, SG_UE, polled_UE, estimulada, CD_CARGO, vv, group_digest) %>%
filter(scenario_id == first(scenario_id)) %>%
ungroup()
saveRDS(all_polls_3, 'all_polls_2020.rda')
no_vv_polls = all_polls_3 %>%
filter(is.na(vv)) %>%
filter(!(SG_UE == '02550' & NUMERO_CANDIDATO == 27 & NOME_URNA_CANDIDATO != 'CHICO PRETO')) %>%
group_by(NR_IDENTIFICACAO_PESQUISA, scenario_id, NUMERO_CANDIDATO) %>%
filter(all(n() == 1)) %>%
ungroup() %>%
select(-NOME_URNA_CANDIDATO)
early_polls_2 = no_vv_polls %>%
left_join(election_dates, 'year') %>%
filter(DT_FIM_PESQUISA <= candidate_registry_date) %>%
filter(estimulada == 0)
recent_polls = no_vv_polls %>%
left_join(election_dates, 'year') %>%
group_by(NR_IDENTIFICACAO_PESQUISA, NR_CNPJ_EMPRESA, SG_UE, polled_UE, CD_CARGO, vv) %>%
filter(n_distinct(estimulada) == 1 | estimulada == 1) %>%
ungroup() %>%
anti_join(early_polls_2, by = c('NR_IDENTIFICACAO_PESQUISA', 'scenario_id'))
early_polls_2 %>% write.csv('output/early_polls_2020_2.csv', row.names = F)
recent_polls %>% write.csv('output/recent_polls_2020.csv', row.names = F)
cands %>%
semi_join(all_polls_3, by = c('SIGLA_UE' = 'SG_UE', 'CODIGO_CARGO' = 'CD_CARGO', 'ANO_ELEICAO' = 'year')) %>%
filter(CODIGO_CARGO %in% c(1, 3, 11)) %>%
distinct(SIGLA_UE, NUM_TURNO, CODIGO_CARGO, ANO_ELEICAO, NUMERO_CANDIDATO, NOME_URNA_CANDIDATO) %>%
write.csv('output/pindograma_candlist_2020.csv', row.names = F)
df_for_merge %>%
distinct(SG_UF, SG_UE, str_to_title(NM_UE)) %>%
mutate(final_name = paste0(`str_to_title(NM_UE)`, ' (', SG_UF, ')')) %>%
write.csv('output/cities_2020.csv', row.names = F)