-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
101 lines (97 loc) · 3.27 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import fitz # this is pymupdf
from googletranslate import translate
from translatepython.translate import Translator
from Bing_Translate import BingTranslate
import re
import numpy as np
import time
import random
import os
import glob
# Build an English -> Vietnamese word dictionary from the PDFs in ./pdf.
#
# Every distinct 3-15 letter word found in the PDFs is sent to Google, Bing
# and MyMemory.  A word is stored (as an HTML snippet) only when Google and at
# least one other service return a translation that differs from the word
# itself.  The dictionary is persisted in ``nameDict`` between runs.

# Translation back-ends.  Google is used through the imported ``translate``
# function; the other two are client objects.
myMemoryTranslate = Translator(to_lang="vi")
bingTranslate = BingTranslate()
dct = {}
dem = 0  # number of entries added to the dictionary during this run
nameDict = 'dict.npy'

# Runs of 3-15 lowercase ASCII letters that are NOT part of a longer run of
# letters: the look-arounds make over-long words get skipped instead of being
# chopped into meaningless fragments.
wordPattern = re.compile(r"(?<![a-z])[a-z]{3,15}(?![a-z])")

# One HTML row per translation service: {0} is the service label, {1} the
# capitalized translation.
htmlRow = "<b>- {0} :</b><br /> + {1}<br/>"

# Load the dictionary written by a previous run; start empty on the first run
# instead of crashing when the file does not exist yet.
# NOTE(security): np.save stores a dict as a pickled object array, so loading
# it needs allow_pickle=True, which can execute arbitrary code.  Only load a
# dict.npy that this script produced itself.
if os.path.isfile(nameDict):
    dct = np.load(nameDict, allow_pickle=True).item()


def _save_dict():
    """Write the dictionary to ``nameDict`` and print its current size."""
    np.save(nameDict, dct)
    print("SAVE IN DICT")
    print("Hien co ", len(dct), "tu")


def _read_words(path):
    """Return the set of candidate words found in the PDF at *path*.

    The text of all pages is lowercased and joined with spaces (so words do
    not merge across page boundaries) before ``wordPattern`` is applied.
    """
    with fitz.open(path) as doc:
        text = " ".join(page.get_text().lower() for page in doc)
    return set(wordPattern.findall(text))


def _translate_or_none(label, translator, word):
    """Return the normalized translation of *word*, or ``None`` if unusable.

    *translator* is called with *word*; its result is lowercased and stripped.
    ``None`` is returned when the call (or the normalization) raises, or when
    the result is empty or identical to *word*.  *label* only prefixes the
    error message.
    """
    try:
        translated = translator(word).lower().strip()
    except Exception as e:
        # Deliberately broad: this is best-effort and the exception types of
        # the third-party clients are not known here.
        print(label + " translate error: ", e)
        return None
    if translated == "" or translated == word:
        return None
    return translated


def _build_entry(word):
    """Return ``(html, summary)`` for *word*, or ``None`` to skip the word.

    Google is mandatory: without a usable Google translation the other
    services are not queried at all.  At least one of Bing / MyMemory must
    also yield a usable translation.  *html* is the value stored in the
    dictionary; *summary* is the tab-separated text printed to the console.
    """
    google = _translate_or_none(
        "Google", lambda w: translate(w, dest='vi', src='en'), word)
    if google is None:
        return None
    html = "<div>" + htmlRow.format("Google", google.capitalize())
    summary = "Google: " + google + "\t"
    sources = 1

    bing = _translate_or_none("Bing", bingTranslate.translate, word)
    if bing is not None:
        sources += 1
        html += htmlRow.format("Bing", bing.capitalize())
        summary += "Bing: " + bing + "\t"

    myMemory = _translate_or_none(
        "MyMemory", myMemoryTranslate.translate, word)
    if myMemory is not None:
        sources += 1
        html += htmlRow.format("MyMemory", myMemory.capitalize())
        summary += "Mymemory: " + myMemory

    if sources < 2:
        return None
    return html + "</div>", summary


# Make sure the input folder exists so the glob below has somewhere to look.
if os.path.isdir('pdf'):
    print('Exists folder pdf')
else:
    print('Folder pdf does not exist')
    os.mkdir('pdf')

for pathFile in glob.glob('pdf/*.pdf'):
    print("FILE:", pathFile)
    result = _read_words(pathFile)
    print("Da doc", len(result), "tu")
    for word in result:
        if word in dct:
            print(word, "da co trong tu dien")
            continue
        entry = _build_entry(word)
        if entry is None:
            continue
        value, trans = entry
        print("NEW:", word, ":", trans)
        dct[word] = value
        # Checkpoint every 10 new entries so an interrupted run loses little.
        dem += 1
        if dem % 10 == 0:
            _save_dict()
        # NOTE: no pause is made between requests.  A random 20-30 s sleep
        # every 30 words was once sketched here but is disabled.

# Final save so entries added since the last checkpoint are not lost.
_save_dict()