-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpre-processing.R
107 lines (91 loc) · 3.93 KB
/
pre-processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
library(tidyverse)
# Locate the OCR text files to process. Every file in "ocr-text" whose name
# starts with "vrb" is a candidate.
vrfiles <- list.files("ocr-text", pattern = "^vrb", full.names = TRUE)

# Page numbers whose OCR files have been tagged so far. Extend this vector
# when a newly tagged page is added; the file filter below is built from it,
# so the regular expression no longer has to be edited by hand.
tagged_pages <- 598:613
tagged_pattern <- paste0("(", paste(tagged_pages, collapse = "|"), ")\\.txt")
inpfiles <- grep(tagged_pattern, vrfiles, perl = TRUE, value = TRUE)
inpfiles

# Three-digit page number found between "_" and ".txt" in each selected path.
pages <- str_extract(inpfiles, "(?<=_)[0-9]{3}(?=\\.txt)")
# Read the XML-tagged texts, one list element per input file.
# For each file: keep only the lines that contain a <gloss>, <lang> or <term>
# tag, then break those lines apart at whitespace that precedes a "<", so that
# every resulting string starts at a tag.
# (An earlier variant of this step handled a single file without map().)
txt <- map(inpfiles, \(path) {
  tagged_lines <- str_subset(read_lines(path), "\\<(gloss|lang|term)\\b")
  unlist(str_split(tagged_lines, "\\s+(?=\\<)"))
})
# process the languages ====
# Collect the text content of every <lang> element across all pages, keep the
# first occurrence of each name, and number the names in that order.
lang_elements <- str_subset(unlist(txt), "\\<lang")
lang_names <- unique(
  str_extract(lang_elements, "(?<=\\>)[^<]+?(?=\\<\\/lang\\>)")
)
lang <- tibble(lang = lang_names, ID = seq_along(lang_names))
lang
# Manual assignment of each language ID to its group label. The IDs are the
# row numbers of `lang`, i.e. the order in which the language names first
# occur in the tagged files, so this table has to be revised whenever that
# order changes or new languages appear.
lang_grp <- tribble(
  ~ID, ~Group,
  1L, "Sumātra",
  2L, "Sumātra",
  3L, "Inselgruppen weftlich von Sumātra",
  4L, "Inselgruppen weftlich von Sumātra",
  5L, "Inselgruppen weftlich von Sumātra",
  6L, "Inselgruppen weftlich von Sumātra",
  7L, "Selēbes",
  8L, "Selēbes",
  9L, "Aru-Inseln",
  10L, "Aru-Inseln",
  11L, "Aru-Inseln",
  12L, "Südofter-Inseln",
  13L, "Südofter-Inseln",
  14L, "Südofter-Inseln",
  15L, "Südofter-Inseln",
  16L, "Südofter-Inseln",
  17L, "Neuguinea",
  18L, "Neuguinea",
  19L, "Neuguinea",
  20L, "Neuguinea",
  21L, "Neuguinea"
)

# Attach the group to every language. The key is named explicitly so the join
# cannot silently pick up any other column the two tables happen to share.
lang <- lang |> left_join(lang_grp, by = "ID")

# A language whose ID is not listed above ends up with an NA group; report it
# instead of letting it pass unnoticed.
if (anyNA(lang$Group)) {
  warning(
    "Languages without an entry in `lang_grp`: ",
    paste(lang$lang[is.na(lang$Group)], collapse = ", "),
    call. = FALSE
  )
}

# Language names in order of first appearance.
lang_vct <- unique(lang$lang)
# extract elements into tibble ====
# Label each page's strings with its page number.
names(txt) <- pages

# One alternation that pulls out, in order of appearance within a string:
# the value of target="...", the value of xml:lang="...", any text sitting
# between ">" and "<", and the value of a change="..." that closes the tag.
pattern_to_extract <- "((?<=target\\=\")([^\"]+?)(?=\")|(?<=xml\\:lang\\=\")([^\"]+?)(?=\")|(?<=\\>)([^<]+)(?=\\<)|(?<=change\\=\")([^\"]+?)(?=\"\\>))"

# Build one row per <gloss> string:
#   1. keep the strings containing a <gloss> tag,
#   2. extract all matches into a character matrix (one row per string),
#   3. turn each page's matrix into a tibble with positional names ...1, ...2,
#      and so on, and record the page number in `pp`,
#   4. stack the pages and give the first four match columns their names.
# rename() needs all four positional columns to exist, i.e. at least one
# string on some page must have produced four matches.
lang_term_gloss <- txt |>
  map(str_subset, "\\<gloss\\b") |>
  map(str_extract_all, pattern_to_extract, simplify = TRUE) |>
  map(as_tibble, .name_repair = "unique") |>
  map2(pages, \(page_tbl, page) mutate(page_tbl, pp = page)) |>
  list_rbind() |>
  rename(
    lang = `...1`,
    german = `...2`,
    form_orig = `...3`,
    form_change = `...4`
  ) |>
  # Add the language group; the key is explicit so no other shared column can
  # become part of the join by accident.
  left_join(select(lang, -ID), by = "lang") |>
  # Order languages by first appearance rather than alphabetically.
  mutate(lang = factor(lang, levels = lang_vct)) |>
  arrange(pp, german, lang) |>
  mutate(
    # Treat a missing changed form like an empty one ...
    form_change = replace_na(form_change, ""),
    # ... and prefer the changed form over the original whenever it exists.
    forms = if_else(form_change == "", form_orig, form_change)
  ) |>
  select(
    Pages = pp,
    Language = lang,
    LanguageGroup = Group,
    German = german,
    Forms = forms,
    OldFormOrig = form_orig,
    OldFormChange = form_change
  )
# Translations of the German glosses.
# Three files are read line by line: the German glosses, their English
# translation (from the DeepL translator) and their Indonesian translation
# (DeepL, German to Indonesian). They are then placed side by side as the
# columns of one lookup table.
gloss_sources <- c(
  German = "data/German_Gloss.txt",
  English = "data/English_Translation",
  Indonesian = "data/Indonesian_Translation"
)
gloss_columns <- map(gloss_sources, read_lines)
the_German <- gloss_columns[["German"]]
the_English <- gloss_columns[["English"]]
the_Indonesian <- gloss_columns[["Indonesian"]]
the_Gloss <- tibble(
  German = the_German,
  English = the_English,
  Indonesian = the_Indonesian
)
# Re-run this every time a new page has been tagged.
# Attach the English and Indonesian translation to each row via its German
# gloss, place both translation columns directly after `German`, and write the
# result as a tab-separated file.
lang_term_gloss |>
  # The key is named explicitly so the join does not depend on which column
  # names the two tables happen to share.
  left_join(the_Gloss, by = "German") |>
  relocate(English, Indonesian, .after = German) |>
  # relocate(Forms, .before = German) |>
  write_tsv("data/vrosenberg1878.tsv")