-
Notifications
You must be signed in to change notification settings - Fork 0
/
speech_ratio.r
93 lines (86 loc) · 2.88 KB
/
speech_ratio.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
library("tidyverse")
# Lookup table mapping the abbreviated work identifiers used in the data
# (column `work`) to the display names written to the output table (column
# `work_name`). The final "total" entry labels the summary row.
WORKS <- tibble(
  work = c(
    "Argon.",
    "Callim.Hymn",
    "Dion.",
    "Hom.Hymn",
    "Il.",
    "Od.",
    "Phaen.",
    "Q.S.",
    "Sh.",
    "Theoc.",
    "Theog.",
    "W.D.",
    "total"
  ),
  work_name = c(
    "Argonautica",
    "Callimachus’ Hymns",
    "Nonnus’ Dionysiaca",
    "Homeric Hymns",
    "Iliad",
    "Odyssey",
    "Aratus’ Phaenomena",
    "Quintus of Smyrna’s Fall of Troy",
    "Shield",
    "Theocritus’ Idylls",
    "Theogony",
    "Works and Days",
    "Total"
  )
)
# Read the per-word table and reduce it to one row per work, holding the
# number of lines for each speech status (columns named after the values of
# `is_speech`: `TRUE`, `FALSE`, and `NA` where present).
data <- read_csv(
  "sedes/joined.all.speaker.csv",
  col_types = cols_only(
    work = col_factor(),
    book_n = col_character(),
    line_n = col_character(),
    word_n = col_integer(),
    is_speech = col_factor()
  )
) %>%
  # Turn the "Yes"/"No" factor into a logical.
  mutate(is_speech = recode(is_speech, "Yes" = TRUE, "No" = FALSE)) %>%
  # Number the lines in their original order. A new line starts whenever
  # the work, book, or line number changes, or whenever word_n fails to
  # increase. The word_n test separates consecutive lines that happen to
  # share a line number, which would otherwise be merged into one. The
  # first row has nothing to compare against (the comparison is NA) and
  # always starts a line.
  mutate(
    line_start = work != lag(work) |
      book_n != lag(book_n) |
      line_n != lag(line_n) |
      word_n <= lag(word_n),
    idx = cumsum(replace_na(line_start, TRUE))
  ) %>%
  group_by(idx) %>%
  # Collapse words to lines: a line counts as speech if any of its words
  # is marked as speech.
  summarize(
    work = first(work),
    book_n = first(book_n),
    line_n = first(line_n),
    is_speech = any(is_speech),
    .groups = "drop"
  ) %>%
  # Count lines per work and speech status, one column per status.
  count(work, is_speech) %>%
  pivot_wider(id_cols = work, names_from = is_speech, values_from = n)
# Two works, Phaen. and Sh., are not covered by DICES, so their computed rows
# are dropped and replaced with hand-entered line counts.
# Stephen Sansom writes:
#
#   All speech in Aratus is narrator speech.
#   Here are the details of speech for the Shield:
#
#                Character Speech    Narrator Speech
#                ----------------    ---------------
#     Heracles         46
#     Iolaos           12
#     Athena           15
#     Total            73                  407
#
# Those figures give Sh. 480 lines in total, whereas our own count is 479.
# We keep the character speech count as given and take the one-line
# difference out of the narrator count (407 -> 406).
data <- bind_rows(
  filter(data, !(work %in% c("Phaen.", "Sh."))),
  tibble(
    work = c("Phaen.", "Sh."),
    `NA` = c(0, 0),
    `TRUE` = c(0, 73),
    `FALSE` = c(1155, 406)
  )
)
# Order the works from the highest to the lowest share of character-speech
# lines. The denominator is the number of lines in all three status columns,
# with missing counts treated as zero.
data <- arrange(
  data,
  desc(`TRUE` / rowSums(cbind(`FALSE`, `TRUE`, `NA`), na.rm = TRUE))
)
# Append a "total" row, print a diagnostic table, and write the formatted
# table to speech_ratio.csv. `data` ends up holding the formatted table,
# because write_csv() returns its input.
#
# Two different percentages are produced on purpose:
#   * percent_speech (printed only) divides by all lines, including those in
#     the `NA` status column;
#   * `character %` / `narrator %` (written to the CSV) divide by the lines
#     with a known status only (`FALSE` + `TRUE`).
data <- data %>%
  bind_rows(
    data %>%
      summarize(
        # A lambda is used instead of across(..., sum, na.rm = TRUE):
        # passing extra arguments through across() is deprecated.
        across(c(`NA`, `FALSE`, `TRUE`), ~ sum(.x, na.rm = TRUE)),
        work = "total"
      )
  ) %>%
  # A count column may be NA rather than 0 for a work that has no lines in
  # that category (the pivot upstream does not fill missing combinations).
  # Zero them so the arithmetic below yields numbers instead of "NA" / "NA%",
  # matching the NA-as-zero treatment used for the total row.
  mutate(across(c(`NA`, `FALSE`, `TRUE`), ~ replace_na(.x, 0))) %>%
  mutate(
    percent_speech = sprintf(
      "%4.1f%%",
      100 * `TRUE` / (`FALSE` + `TRUE` + `NA`)
    )
  ) %>%
  mutate(`NA` = NULL) %>%
  # Diagnostic output of the numeric table before it is formatted.
  print() %>%
  left_join(WORKS, by = "work") %>%
  transmute(
    `Work` = work_name,
    `Character Speech` = scales::comma(`TRUE`, accuracy = 1),
    `character %` = sprintf("%.1f%%", 100 * `TRUE` / (`FALSE` + `TRUE`)),
    `Narrator` = scales::comma(`FALSE`, accuracy = 1),
    `narrator %` = sprintf("%.1f%%", 100 * `FALSE` / (`FALSE` + `TRUE`))
  ) %>%
  write_csv("speech_ratio.csv")