-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharabictranslation.py
231 lines (165 loc) · 6.7 KB
/
arabictranslation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# -*- coding: utf-8 -*-
"""ArabicTranslation.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1B-RGly2mMWsKEpdV3q2SNXRA9ZoyNOob
# Machine Translation
1- Import Dependencies
2- import dataset for training
3- Basic EDA and visualization
4- Data Cleaning
5- Tokenization and build vocabulary
6- Pad Sequence and Vectorization
7- Train model
8- Predictions
# Import Libraries
"""
# Commented out IPython magic to ensure Python compatibility.
# basic libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import seaborn as sns
# %matplotlib inline
# cleaning data
import re
import os
import nltk
nltk.download("stopwords")
nltk.download('punkt')
# save vocabulary in files
import pickle
# tokenization
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM, Dropout
from tensorflow.keras.layers import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.callbacks import ModelCheckpoint
"""# Import Dataset"""
# Load the tab-separated sentence pairs. `names=` supplies the column labels,
# so the first line of the file is read as data rather than as a header.
df = pd.read_csv("ara_eng.txt", delimiter="\t", names=["English", "Arabic"])
df.head()  # notebook leftover: the preview is discarded when run as a script
df.info()
# Keep only the first 20,000 pairs. An explicit copy makes `df` an independent
# frame, so the later column assignment on it does not write into a slice of
# the full dataset (which triggers pandas' SettingWithCopyWarning).
df = df[:20000].copy()
"""# Basic EDA and visualization"""
# The two parallel columns of the corpus.
english_sentences = df['English']
arabic_sentences = df['Arabic']

# Split every sentence on whitespace once and reuse the flat token lists for
# both the totals and the frequency tables.
english_tokens = [token for line in english_sentences for token in line.split()]
arabic_tokens = [token for line in arabic_sentences for token in line.split()]
english_words_counter = collections.Counter(english_tokens)
arabic_words_counter = collections.Counter(arabic_tokens)

# English vocabulary summary.
print('{} English words.'.format(len(english_tokens)))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
english_top_words = list(zip(*english_words_counter.most_common(10)))[0]
print('"' + '" "'.join(english_top_words) + '"')

print()

# Arabic vocabulary summary.
print('{} Arabic words.'.format(len(arabic_tokens)))
print('{} unique Arabic words.'.format(len(arabic_words_counter)))
print('10 Most common words in the Arabic dataset:')
arabic_top_words = list(zip(*arabic_words_counter.most_common(10)))[0]
print('"' + '" "'.join(arabic_top_words) + '"')

# One bar chart per language: how many sentences have each word count.
for language in ('English', 'Arabic'):
    # Number of sentences for every sentence length (in whitespace-split words).
    word_count = df[language].str.split().apply(len).value_counts()
    # Bars are ordered by ascending number of occurrences, not by length.
    word_dict = dict(sorted(dict(word_count).items(), key=lambda kv: kv[1]))
    index = np.arange(len(word_dict))
    values1 = word_dict.values()

    plt.figure(figsize=(36, 5))
    plt.bar(index, values1)
    plt.xlabel('Length of sentences in ' + language)
    plt.ylabel('occurances')
    # Tick labels are the sentence lengths that belong to each bar.
    plt.xticks(index, word_dict.keys())
    plt.show()
"""# Data Cleaning"""
def clean_english(text):
    """Normalise one English sentence.

    The sentence is lower-cased, every character other than ``a-z``, ``!``,
    ``?``, ``'`` and ``,`` is replaced by a space, and the result is split
    with NLTK's word tokenizer and re-joined with single spaces.

    Args:
        text: The raw sentence as a string.

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    lowered = text.lower()
    # Anything outside the allowed character set becomes a space.
    filtered = re.sub("[^a-z!?',]", " ", lowered)
    tokens = nltk.word_tokenize(filtered)
    return " ".join(token.strip() for token in tokens)


# Replace the English column with its cleaned version.
# NOTE(review): `english_sentences` was bound to df['English'] before this
# reassignment, so it may still refer to the uncleaned text - confirm which
# version the tokenization step is supposed to consume.
df["English"] = df["English"].apply(clean_english)
### We won't need to clean arabic text
"""# Tokenization"""
def tokenize(x):
    """Fit a new Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: The texts to fit on and to encode.

    Returns:
        A tuple ``(sequences, tokenizer)``: the result of
        ``texts_to_sequences(x)`` and the fitted tokenizer itself.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    sequences = fitted.texts_to_sequences(x)
    return sequences, fitted
"""# Padding"""
def pad(x, length=None):
    """Pad (or truncate) every sequence in ``x`` to one common length.

    Args:
        x: The sequences to pad, in any form Keras ``pad_sequences`` accepts.
        length: Target sequence length. When ``None``, the fixed default of
            55 is used.

    Returns:
        The array produced by ``pad_sequences`` with ``padding='post'``.
    """
    if length is None:
        # Keep the historical fixed length as the default: the model in this
        # file emits one output step per input step, so the source and target
        # arrays padded through this default must share the same length.
        length = 55
    # Previously `length` was ignored and maxlen was always 55; an explicit
    # `length` is now honoured.
    return pad_sequences(x, maxlen=length, padding='post')
"""# Preprocess pipeline"""
def preprocess(x, y):
    """Tokenize and pad the feature texts ``x`` and the label texts ``y``.

    Args:
        x: Feature sentences.
        y: Label sentences.

    Returns:
        A tuple ``(padded_x, padded_y, x_tokenizer, y_tokenizer)``;
        ``padded_y`` has one extra trailing axis of size 1.
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    padded_x = pad(x_ids)
    padded_y = pad(y_ids)

    # Keras's sparse_categorical_crossentropy function requires the labels to
    # be in 3 dimensions, so append a singleton axis.
    labels = padded_y.reshape(padded_y.shape + (1,))
    return padded_x, labels, x_tk, y_tk
# Run the pipeline with English as the features and Arabic as the labels.
(
    preproc_english_sentences,
    preproc_arabic_sentences,
    english_tokenizer,
    arabic_tokenizer,
) = preprocess(english_sentences, arabic_sentences)

# Axis 1 of each padded array is the sequence length.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_arabic_sequence_length = preproc_arabic_sentences.shape[1]

# Number of distinct words each tokenizer has indexed.
english_vocab_size = len(english_tokenizer.word_index)
arabic_vocab_size = len(arabic_tokenizer.word_index)

# Report the four figures, one per line.
for label, value in (
    ("Max English sentence length:", max_english_sequence_length),
    ("Max Arabic sentence length:", max_arabic_sequence_length),
    ("English vocabulary size:", english_vocab_size),
    ("Arabic vocabulary size:", arabic_vocab_size),
):
    print(label, value)
""" Convert the final prediction by our model into text form"""
def logits_to_text(logits, tokenizer):
    """Turn per-step model scores into a space-separated sentence.

    Args:
        logits: 2-D array; the highest-scoring column of each row is taken
            as that step's word index (``np.argmax(logits, 1)``).
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The words for every row joined by single spaces. Index 0 is rendered
        as ``'<PAD>'``.

    Raises:
        KeyError: If a predicted index is neither 0 nor in ``word_index``.
    """
    # Invert word -> index into index -> word (named `index`, not `id`, so the
    # builtin is not shadowed).
    index_to_words = {index: word for word, index in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    return ' '.join(index_to_words[prediction] for prediction in np.argmax(logits, 1))
"""# Model"""
def model(input_shape, output_sequence_length, english_vocab_size, arabic_vocab_size):
    """Build and compile the embedding + bidirectional-GRU network.

    Args:
        input_shape: Shape of the input array; index 1 is used as the input
            length and everything after index 0 as the per-sample shape.
        output_sequence_length: Accepted but not used by this function.
        english_vocab_size: Width of the final softmax layer.
        arabic_vocab_size: Input dimension of the embedding layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    learning_rate = 0.003

    # Layers in forward order.
    network = Sequential([
        Embedding(arabic_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]),
        Bidirectional(GRU(256, return_sequences=True)),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(english_vocab_size, activation='softmax')),
    ])

    network.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return network
# Notebook leftover: the shape is evaluated but not used.
preproc_english_sentences.shape

# The Arabic array carries the trailing singleton axis added in preprocess();
# re-pad it to its own length and drop that axis to get a 2-D model input.
arabic_timesteps = preproc_arabic_sentences.shape[1]
tmp_x = pad(preproc_arabic_sentences, arabic_timesteps)
tmp_x = tmp_x.reshape((-1, preproc_arabic_sentences.shape[-2]))

# Each vocabulary size is word_index + 1, presumably to leave room for index 0
# (which logits_to_text renders as '<PAD>').
# Note: this rebinds `model` from the builder function to the built network.
model = model(
    input_shape=tmp_x.shape,
    output_sequence_length=preproc_english_sentences.shape[1],
    english_vocab_size=len(english_tokenizer.word_index) + 1,
    arabic_vocab_size=len(arabic_tokenizer.word_index) + 1,
)
model.summary()

# Train Arabic -> English, holding out 20% of the samples for validation.
model.fit(
    tmp_x,
    preproc_english_sentences,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)
model.save('model.h5')
"""# Predictions"""
def translation(i):
    """Print sentence pair ``i`` from the corpus next to the model's output.

    Prints the Arabic source, the English sentence from the dataset, and the
    text decoded from ``model.predict`` on row ``i`` of ``tmp_x``.

    Args:
        i: Zero-based row position of the sentence pair.

    Returns:
        None. The function only prints.
    """
    # Positional lookup, so the printed pair is the same row as tmp_x[i]
    # regardless of the Series' index labels.
    print("Arabic text:", arabic_sentences.iloc[i])
    print("\nEnglish Translation:", english_sentences.iloc[i])
    # tmp_x[[i]] keeps the batch axis; [0] takes the single prediction back out.
    prediction = model.predict(tmp_x[[i]])[0]
    print("\nTranslation:", logits_to_text(prediction, english_tokenizer))


# translation() prints its own output and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after each sample.
translation(680)
translation(220)
translation(114)