forked from ajesujoba/lafand-mt
preprocess_mt5.py
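"""Convert the MENYO English-Yoruba TSV splits (train/dev/test) into the JSON
Lines format used for mT5 fine-tuning: one {"translation": {"en": ..., "yo": ...}}
object per line, in both en->yo and yo->en directions. (Dataset name inferred
from the menyo_* variable names below.)"""
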
import os
import unicodedata

import jsonlines
import pandas as pd


def normalize_diacritics_text(text_string):
    """Convenience wrapper to abstract away unicode & NFC"""
    return unicodedata.normalize("NFC", text_string)


def create_dir(output_dir):
    """Create output_dir (and parent directories) if it does not already exist."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)


def read_file(input_file):
    """Read a UTF-8 text file and return its non-empty lines."""
    print(input_file)
    with open(input_file, encoding='utf-8') as f:
        text_lines = f.read().splitlines()
    sentences = [sent for sent in text_lines if len(sent) > 0]
    return sentences


def export_json_files(output_dir, filename, df, direction='en-yo'):
    """Write one {"translation": {src_lang: ..., tgt_lang: ...}} record per
    sentence pair to a JSON Lines file."""
    to_be_saved = []
    src_data = df['source_lang'].values
    tgt_data = df['target_lang'].values
    src_lang, tgt_lang = direction.split('-')
    N_sent = df.shape[0]
    for s in range(N_sent):
        # The record shape is identical for both translation directions.
        text_string = {"translation": {src_lang: src_data[s], tgt_lang: tgt_data[s]}}
        to_be_saved.append(text_string)
    with jsonlines.open(output_dir + filename, 'w') as writer:
        writer.write_all(to_be_saved)
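
# Example of a single exported line (illustrative sentence pair, not taken from
# the dataset): {"translation": {"en": "Good morning.", "yo": "Ẹ káàárọ̀."}}
# This nested-"translation" JSON Lines layout is the one Hugging Face style
# translation fine-tuning scripts typically consume (assumption about the
# downstream training script).

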
def combine_texts(data_dir):
    """Build en->yo and yo->en train/dev/test JSON files from the MENYO TSV splits."""
    #en_jw = read_file(data_dir + 'jw300.en')
    #yo_jw = read_file(data_dir + 'jw300.yo')
    menyo_train = pd.read_csv(data_dir + 'train.tsv', sep='\t')
    en_menyo_train = menyo_train['english'].values
    yo_menyo_train = menyo_train['yoruba'].values

    menyo_dev = pd.read_csv(data_dir + 'dev.tsv', sep='\t')
    en_menyo_dev = menyo_dev['english'].values
    yo_menyo_dev = menyo_dev['yoruba'].values

    menyo_test = pd.read_csv(data_dir + 'test.tsv', sep='\t')
    en_menyo_test = menyo_test['english'].values
    yo_menyo_test = menyo_test['yoruba'].values

    ## merge data
    # Train data
    train_data_en = list(en_menyo_train)  #en_jw + list(en_menyo_train)
    train_data_yo = list(yo_menyo_train)  #yo_jw + list(yo_menyo_train)
    df_train_enyo = pd.DataFrame(train_data_en, columns=['source_lang'])
    df_train_enyo['target_lang'] = train_data_yo
    df_train_yoen = pd.DataFrame(train_data_yo, columns=['source_lang'])
    df_train_yoen['target_lang'] = train_data_en

    # dev data
    df_dev_enyo = pd.DataFrame(en_menyo_dev, columns=['source_lang'])
    df_dev_enyo['target_lang'] = yo_menyo_dev
    df_dev_yoen = pd.DataFrame(yo_menyo_dev, columns=['source_lang'])
    df_dev_yoen['target_lang'] = en_menyo_dev

    # test data
    df_test_enyo = pd.DataFrame(en_menyo_test, columns=['source_lang'])
    df_test_enyo['target_lang'] = yo_menyo_test
    df_test_yoen = pd.DataFrame(yo_menyo_test, columns=['source_lang'])
    df_test_yoen['target_lang'] = en_menyo_test

    # output data
    output_dir = 'data/json_files/en_yo/'
    create_dir(output_dir)
    export_json_files(output_dir, 'train.json', df_train_enyo)
    export_json_files(output_dir, 'dev.json', df_dev_enyo)
    export_json_files(output_dir, 'test.json', df_test_enyo)

    output_dir = 'data/json_files/yo_en/'
    create_dir(output_dir)
    export_json_files(output_dir, 'train.json', df_train_yoen, direction='yo-en')
    export_json_files(output_dir, 'dev.json', df_dev_yoen, direction='yo-en')
    export_json_files(output_dir, 'test.json', df_test_yoen, direction='yo-en')


if __name__ == "__main__":
    data_dir = 'data/tsv/eng_to_yor/'
    combine_texts(data_dir)
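
# Usage sketch (assumes the TSV splits train.tsv/dev.tsv/test.tsv, with
# 'english' and 'yoruba' columns as read above, live under data/tsv/eng_to_yor/):
#   python preprocess_mt5.py
# Output: train/dev/test JSON Lines files under data/json_files/en_yo/ and
# data/json_files/yo_en/.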