@@ -36,9 +36,10 @@ def __init__(self) -> None:
36
36
self .sources : dict [str , self .SourceInfo ] = {}
37
37
38
38
class SourceInfo:
    """Describe one Wiktextract source entry of the corpus configuration.

    Attributes:
        url: Location the raw dump is downloaded from.
        original_file: File name (relative to the download directory) under
            which the unmodified download is stored.
        filtered_file: File name (relative to the download directory) that
            receives the filtered copy of ``original_file``.
    """

    def __init__(self, url: str, original_file: str, filtered_file: str) -> None:
        self.url: str = url
        # The raw download and its filtered derivative are tracked separately
        # so the original can be kept on disk and re-filtered later.
        self.original_file: str = original_file
        self.filtered_file: str = filtered_file
42
43
43
44
44
45
class GoogleNgramCorpusConfig :
@@ -73,9 +74,10 @@ def parse_json_to_corpus_config(file_path) -> CorpusConfig:
73
74
if sources :
74
75
for source_name , source_info in sources .items ():
75
76
url = source_info .get ("url" )
76
- file = source_info .get ("file" )
77
- if url and file :
78
- source = WiktextractCorpusConfig .SourceInfo (url , file )
77
+ original_file = source_info .get ("originalFile" )
78
+ filtered_file = source_info .get ("filteredFile" )
79
+ if url and original_file and filtered_file :
80
+ source = WiktextractCorpusConfig .SourceInfo (url , original_file , filtered_file )
79
81
wiktextract_config .sources [source_name ] = source
80
82
81
83
# Parse googlengram data
@@ -115,37 +117,34 @@ def filter_json_data(corpus_config: CorpusConfig, json_data):
115
117
return filtered_json_data
116
118
117
119
118
def filter_wiktextract_file(corpus_config: CorpusConfig, kaikki_path: str, filtered_kaikki_path: str) -> None:
    """Filter a line-delimited JSON file into a separate output file.

    Every non-blank line of ``kaikki_path`` is parsed as one JSON document,
    passed through ``filter_json_data`` and written as one line of
    ``filtered_kaikki_path``. The input file is left untouched.

    Args:
        corpus_config: Configuration forwarded unchanged to ``filter_json_data``.
        kaikki_path: Path of the file to read (one JSON document per line).
        filtered_kaikki_path: Path of the file to create or overwrite.

    Raises:
        OSError: If either file cannot be opened, read, written or replaced.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    # Write to a sibling temporary file first: the input is never truncated
    # when both paths point at the same file, and a failure half-way through
    # does not leave a partial result under the final name.
    tmp_path = f"{filtered_kaikki_path}.tmp"
    try:
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(kaikki_path, "r", encoding="utf-8") as kaikki_file, \
                open(tmp_path, "w", encoding="utf-8") as filtered_kaikki_file:
            for line in kaikki_file:
                if not line.strip():
                    # Blank lines carry no record and would make json.loads fail.
                    continue
                json_data = json.loads(line)
                filtered_json_data = filter_json_data(corpus_config, json_data)
                filtered_kaikki_file.write(json.dumps(filtered_json_data))
                # Plain newline separator: a trailing space here would leave a
                # whitespace-only final line in the output.
                filtered_kaikki_file.write("\n")
    except BaseException:
        # Do not leave the temporary file behind when filtering is aborted.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    os.replace(tmp_path, filtered_kaikki_path)
129
128
130
129
131
130
def download_wiktextract(corpus_config: CorpusConfig, dst_dir: str) -> int:
    """Download and filter every configured Wiktextract source.

    For each entry of ``corpus_config.wiktextract.sources`` the original file
    is downloaded into ``dst_dir`` unless it already exists there. It is then
    filtered into the entry's filtered file, so an existing download is
    re-filtered on every call. A source whose download fails is reported with
    a warning and skipped.

    Args:
        corpus_config: Configuration providing the sources and filter settings.
        dst_dir: Directory that holds both the original and the filtered files.

    Returns:
        Always ``os.EX_OK``; individual download failures are only printed.
    """
    flutils.print_header("DOWNLOAD WIKTEXTRACT")
    for lang_tag, source in corpus_config.wiktextract.sources.items():
        print(f"[{lang_tag} ]")
        kaikki_path = os.path.join(dst_dir, source.original_file)
        filtered_kaikki_path = os.path.join(dst_dir, source.filtered_file)

        if os.path.exists(kaikki_path):
            print(f"Skip {kaikki_path} (already exists)")
        else:
            print(f"Download {kaikki_path} ")
            ret_code = flutils.download(url=source.url, to_file=kaikki_path)
            if ret_code != 0:  # any non-zero code is treated as a failed download
                print(f"WARN: Failed to complete download (error code {ret_code} )")
                try:
                    # Drop a possibly incomplete file so the next run retries
                    # the download instead of skipping it.
                    os.remove(kaikki_path)
                except FileNotFoundError:
                    # The download failed before creating the file; there is
                    # nothing to clean up and the other sources must still run.
                    pass
                flutils.print_separator()
                continue

        print(f"Filtering {kaikki_path} ")
        filter_wiktextract_file(corpus_config, kaikki_path, filtered_kaikki_path)
        flutils.print_separator()

    return os.EX_OK
0 commit comments