-
Notifications
You must be signed in to change notification settings - Fork 1
/
build.rs
138 lines (120 loc) · 4.32 KB
/
build.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
use flate2::read::GzDecoder;
use serde::Serialize;
use serde_json::json;
use std::{collections::HashMap, env, fs, io::Read, path::Path};
/// Lookup table from a search key to every [`Entry`] filed under that key.
/// `read_dictionary` builds three of these, keyed respectively by kanji form,
/// lowercased English headword, and reading.
type Dictionary = HashMap<String, Vec<Entry>>;
/// One dictionary record built from a single JMdict `<entry>` element.
/// Serialized to JSON (via `Serialize`) when the dictionaries are written out.
#[derive(Clone, PartialEq, Serialize)]
pub struct Entry {
/// Text of the entry's first `<keb>` element; empty when the entry has none.
pub kanji: String,
/// Text of the entry's first `<reb>` element.
pub reading: String,
/// Text of every `<gloss>` element in the entry, in document order.
pub meanings: Vec<String>,
/// Number following the `nf` prefix of the first matching `<re_pri>` element;
/// 999 when there is no such element or the number does not parse.
pub frequency: i32,
}
/// Files `entry` under `key` in `dictionary`.
///
/// A new bucket is created when `key` is not present yet. An entry equal to
/// one already stored under `key` is not added a second time, so calling this
/// repeatedly with the same arguments leaves the dictionary unchanged.
fn upsert(dictionary: &mut Dictionary, key: String, entry: &Entry) {
    // The entry API hashes `key` once, instead of once for the lookup and
    // again for the insert.
    let entries = dictionary.entry(key).or_default();
    if !entries.contains(entry) {
        entries.push(entry.clone());
    }
}
/// Cuts a JMdict English meaning down to its bare headword.
///
/// JMdict often appends a parenthetical explanation to a meaning, e.g.
/// `"to run (quickly)"`. When `meaning` ends with `)`, everything from the
/// first `(` onwards is dropped and the remainder is trimmed.
///
/// Meanings that *begin* with a parenthesis, e.g. `"(lit.) to go (somewhere)"`,
/// would be reduced to an empty string by that rule. For those, only the final
/// balanced parenthetical is removed instead; if nothing is left after that
/// (the whole meaning is parenthesised), `meaning` is returned unchanged.
///
/// Meanings that do not end with `)`, or that contain no `(`, are returned
/// as they are.
fn trim_explanation(meaning: &str) -> &str {
    if !meaning.ends_with(')') {
        return meaning;
    }
    let first_open = match meaning.find('(') {
        Some(index) => index,
        None => return meaning,
    };

    let headword = meaning[..first_open].trim();
    if !headword.is_empty() {
        return headword;
    }

    // The meaning starts with a parenthesis. Walk backwards from the closing
    // `)` to the `(` that balances it and strip only that trailing group.
    let mut depth = 0usize;
    for (index, ch) in meaning.char_indices().rev() {
        match ch {
            ')' => depth += 1,
            '(' => {
                // The scan starts on a `)`, and we return as soon as the depth
                // drops back to zero, so `depth` is at least 1 here.
                depth -= 1;
                if depth == 0 {
                    let remainder = meaning[..index].trim();
                    return if remainder.is_empty() {
                        meaning
                    } else {
                        remainder
                    };
                }
            }
            _ => {}
        }
    }

    // Unbalanced parentheses: leave the meaning untouched.
    meaning
}
fn read_dictionary() -> (Dictionary, Dictionary, Dictionary, String) {
let mut j2e = HashMap::new();
let mut e2j = HashMap::new();
let mut reading = HashMap::new();
let mut version = String::from("unknown");
let mut gz = GzDecoder::new(fs::File::open("./JMdict_e.gz").unwrap());
let mut xml = String::new();
gz.read_to_string(&mut xml).unwrap();
let opt = roxmltree::ParsingOptions {
allow_dtd: true,
..roxmltree::ParsingOptions::default()
};
let doc = match roxmltree::Document::parse_with_options(&xml, opt) {
Ok(doc) => doc,
Err(e) => {
println!("Error: {}", e);
return (j2e, e2j, reading, version);
}
};
if let Some(comment_node) = doc
.descendants()
.find(|n| n.is_comment() && n.text().unwrap().starts_with(" JMdict created: "))
{
// Version comment has the format "JMdict created: 2024-07-15"
let comment = comment_node.text().unwrap();
if let Some((_, version_str)) = comment.split_once(": ") {
version = version_str.trim().to_string();
}
}
for node in doc.descendants().filter(|n| n.has_tag_name("entry")) {
let keb = match node.descendants().find(|n| n.has_tag_name("keb")) {
Some(e) => e.text().unwrap(),
None => "",
};
let reb = match node.descendants().find(|n| n.has_tag_name("reb")) {
Some(e) => e.text().unwrap(),
None => continue,
};
let nf = match node
.descendants()
.find(|n| n.has_tag_name("re_pri") && n.text().unwrap().starts_with("nf"))
{
Some(e) => e.text().unwrap(),
None => "",
};
let glosses = node
.descendants()
.filter(|n| n.has_tag_name("gloss"))
.map(|n| n.text().unwrap().to_string())
.collect();
let entry = Entry {
kanji: keb.to_string(),
reading: reb.to_string(),
meanings: glosses,
frequency: if !nf.is_empty() {
nf[2..].parse().unwrap_or(999)
} else {
999
},
};
if !keb.is_empty() {
upsert(&mut j2e, keb.to_string(), &entry);
}
for meaning in &entry.meanings {
let headword = trim_explanation(meaning).to_lowercase();
upsert(&mut e2j, headword, &entry);
}
upsert(&mut reading, reb.to_string(), &entry);
}
(j2e, e2j, reading, version)
}
fn main() {
let (j2e, e2j, reading, version) = read_dictionary();
let out_dir = env::var_os("OUT_DIR").unwrap();
let j2e_path = Path::new(&out_dir).join("j2e.json");
let j2e_json = json!(j2e);
fs::write(j2e_path, j2e_json.to_string()).unwrap();
let e2j_path = Path::new(&out_dir).join("e2j.json");
let e2j_json = json!(e2j);
fs::write(e2j_path, e2j_json.to_string()).unwrap();
let reading_path = Path::new(&out_dir).join("reading.json");
let reading_json = json!(reading);
fs::write(reading_path, reading_json.to_string()).unwrap();
let version_path = Path::new(&out_dir).join("jmdict_version");
fs::write(version_path, version).unwrap();
println!("cargo:rerun-if-changed=JMdict_e.gz");
}