Skip to content

Commit 54520df

Browse files
committed
Add pre-filtering of wiktextract data files
1 parent 99cedbe commit 54520df

File tree

3 files changed

+52
-45
lines changed

3 files changed

+52
-45
lines changed

.vscode/c_cpp_properties.json

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
{
2-
"version": 4,
3-
"configurations": [
4-
{
5-
"name": "Linux",
6-
"intelliSenseMode": "linux-clang-x64",
7-
"compileCommands": "${workspaceFolder}/build/debug/compile_commands.json",
8-
"defines": [],
9-
"compilerPath": "/usr/bin/clang",
10-
"cStandard": "c17",
11-
"cppStandard": "c++20",
12-
"configurationProvider": "ms-vscode.cmake-tools"
13-
}
14-
]
15-
}
2+
"version": 4,
3+
"configurations": [
4+
{
5+
"name": "Linux",
6+
"intelliSenseMode": "linux-clang-x64",
7+
"compileCommands": "${workspaceFolder}/build/release/compile_commands.json",
8+
"defines": [ ],
9+
"compilerPath": "/usr/bin/clang",
10+
"cStandard": "c17",
11+
"cppStandard": "c++20",
12+
"configurationProvider": "ms-vscode.cmake-tools"
13+
}
14+
]
15+
}

data/corpusdata-config.json

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,35 +6,43 @@
66
"sources": {
77
"en": {
88
"url": "https://kaikki.org/dictionary/English/words/kaikki.org-dictionary-English-words.json",
9-
"file": "kaikki.org-dictionary-English-words.json"
9+
"originalFile": "kaikki.org-dictionary-English-words.json",
10+
"filteredFile": "kaikki.org-dictionary-English-words.jsonl"
1011
},
1112
"de": {
1213
"url": "https://kaikki.org/dictionary/German/words/kaikki.org-dictionary-German-words.json",
13-
"file": "kaikki.org-dictionary-German-words.json"
14+
"originalFile": "kaikki.org-dictionary-German-words.json",
15+
"filteredFile": "kaikki.org-dictionary-German-words.jsonl"
1416
},
1517
"fr": {
1618
"url": "https://kaikki.org/dictionary/French/words/kaikki.org-dictionary-French-words.json",
17-
"file": "kaikki.org-dictionary-French-words.json"
19+
"originalFile": "kaikki.org-dictionary-French-words.json",
20+
"filteredFile": "kaikki.org-dictionary-French-words.jsonl"
1821
},
1922
"it": {
2023
"url": "https://kaikki.org/dictionary/Italian/words/kaikki.org-dictionary-Italian-words.json",
21-
"file": "kaikki.org-dictionary-Italian-words.json"
24+
"originalFile": "kaikki.org-dictionary-Italian-words.json",
25+
"filteredFile": "kaikki.org-dictionary-Italian-words.jsonl"
2226
},
2327
"es": {
2428
"url": "https://kaikki.org/dictionary/Spanish/words/kaikki.org-dictionary-Spanish-words.json",
25-
"file": "kaikki.org-dictionary-Spanish-words.json"
29+
"originalFile": "kaikki.org-dictionary-Spanish-words.json",
30+
"filteredFile": "kaikki.org-dictionary-Spanish-words.jsonl"
2631
},
2732
"pt": {
2833
"url": "https://kaikki.org/dictionary/Portuguese/words/kaikki.org-dictionary-Portuguese-words.json",
29-
"file": "kaikki.org-dictionary-Portuguese-words.json"
34+
"originalFile": "kaikki.org-dictionary-Portuguese-words.json",
35+
"filteredFile": "kaikki.org-dictionary-Portuguese-words.jsonl"
3036
},
3137
"pl": {
3238
"url": "https://kaikki.org/dictionary/Polish/words/kaikki.org-dictionary-Polish-words.json",
33-
"file": "kaikki.org-dictionary-Polish-words.json"
39+
"originalFile": "kaikki.org-dictionary-Polish-words.json",
40+
"filteredFile": "kaikki.org-dictionary-Polish-words.jsonl"
3441
},
3542
"ru": {
3643
"url": "https://kaikki.org/dictionary/Russian/words/kaikki.org-dictionary-Russian-words.json",
37-
"file": "kaikki.org-dictionary-Russian-words.json"
44+
"originalFile": "kaikki.org-dictionary-Russian-words.json",
45+
"filteredFile": "kaikki.org-dictionary-Russian-words.jsonl"
3846
}
3947
}
4048
},

utils/devtools/corpusdata.py

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,10 @@ def __init__(self) -> None:
3636
self.sources: dict[str, self.SourceInfo] = {}
3737

3838
class SourceInfo:
39-
def __init__(self, url: str, file: str) -> None:
39+
def __init__(self, url: str, original_file: str, filtered_file: str) -> None:
4040
self.url: str = url
41-
self.file: str = file
41+
self.original_file: str = original_file
42+
self.filtered_file: str = filtered_file
4243

4344

4445
class GoogleNgramCorpusConfig:
@@ -73,9 +74,10 @@ def parse_json_to_corpus_config(file_path) -> CorpusConfig:
7374
if sources:
7475
for source_name, source_info in sources.items():
7576
url = source_info.get("url")
76-
file = source_info.get("file")
77-
if url and file:
78-
source = WiktextractCorpusConfig.SourceInfo(url, file)
77+
original_file = source_info.get("originalFile")
78+
filtered_file = source_info.get("filteredFile")
79+
if url and original_file and filtered_file:
80+
source = WiktextractCorpusConfig.SourceInfo(url, original_file, filtered_file)
7981
wiktextract_config.sources[source_name] = source
8082

8183
# Parse googlengram data
@@ -115,37 +117,34 @@ def filter_json_data(corpus_config: CorpusConfig, json_data):
115117
return filtered_json_data
116118

117119

118-
def filter_wiktextract_file(corpus_config: CorpusConfig, kaikki_path: str) -> None:
119-
kaikki_tmp_path = f"{kaikki_path}.tmp"
120+
def filter_wiktextract_file(corpus_config: CorpusConfig, kaikki_path: str, filtered_kaikki_path: str) -> None:
120121
with open(kaikki_path, "r") as kaikki_file:
121-
with open(kaikki_tmp_path, "w") as kaikki_tmp_file:
122+
with open(filtered_kaikki_path, "w") as filtered_kaikki_file:
122123
for line in kaikki_file:
123124
json_data = json.loads(line)
124125
filtered_json_data = filter_json_data(corpus_config, json_data)
125-
kaikki_tmp_file.write(json.dumps(filtered_json_data))
126-
kaikki_tmp_file.write("\n")
127-
os.remove(kaikki_path)
128-
os.rename(kaikki_tmp_path, kaikki_path)
126+
filtered_kaikki_file.write(json.dumps(filtered_json_data))
127+
filtered_kaikki_file.write("\n")
129128

130129

131130
def download_wiktextract(corpus_config: CorpusConfig, dst_dir: str) -> int:
132131
flutils.print_header("DOWNLOAD WIKTEXTRACT")
133132
for lang_tag, source in corpus_config.wiktextract.sources.items():
134133
print(f"[{lang_tag}]")
135-
kaikki_path = os.path.join(dst_dir, source.file)
134+
kaikki_path = os.path.join(dst_dir, source.original_file)
135+
filtered_kaikki_path = os.path.join(dst_dir, source.filtered_file)
136136
if os.path.exists(kaikki_path):
137137
print(f"Skip {kaikki_path} (already exists)")
138-
flutils.print_separator()
139-
continue
140-
print(f"Download {kaikki_path}")
141-
ret_code = flutils.download(url=source.url, to_file=kaikki_path)
142-
if ret_code != 0:
143-
print(f"WARN: Failed to complete download (error code {ret_code})")
144-
os.remove(kaikki_path)
145-
flutils.print_separator()
146-
continue
138+
else:
139+
print(f"Download {kaikki_path}")
140+
ret_code = flutils.download(url=source.url, to_file=kaikki_path)
141+
if ret_code != 0:
142+
print(f"WARN: Failed to complete download (error code {ret_code})")
143+
os.remove(kaikki_path)
144+
flutils.print_separator()
145+
continue
147146
print(f"Filtering {kaikki_path}")
148-
filter_wiktextract_file(corpus_config, kaikki_path)
147+
filter_wiktextract_file(corpus_config, kaikki_path, filtered_kaikki_path)
149148
flutils.print_separator()
150149

151150
return os.EX_OK

0 commit comments

Comments
 (0)