-
Notifications
You must be signed in to change notification settings - Fork 1
/
export.py
149 lines (129 loc) · 5.05 KB
/
export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import csv
import os
import pprint
import re
import retrying
import service
import sqlite3
import sys
import urllib
import urllib.parse
import urllib.request
# File in the user's home directory holding a single integer: the timestamp
# read back by get_last_timestamp() and written by update_last_timestamp().
TIMESTAMP_PATH = os.path.expanduser('~/.kindle')
def get_lookups(db, timestamp=0):
    """Return the words stored in a Kindle vocabulary db after *timestamp*.

    Args:
        db: Path to the SQLite database file.
        timestamp: Only rows of ``WORDS`` whose ``timestamp`` column is
            strictly greater than this value are returned.

    Returns:
        A list of ``(stem, usage)`` tuples.  Because the query is a LEFT
        JOIN, ``usage`` is ``None`` for a word without a matching row in
        ``LOOKUPS``.

    Raises:
        sqlite3.Error: If the query fails (e.g. the tables do not exist).
    """
    query = ('select w.stem,l.usage from WORDS as w '
             'LEFT JOIN LOOKUPS as l on w.id=l.word_key '
             'where w.timestamp>?;')
    conn = sqlite3.connect(db)
    try:
        return conn.execute(query, (timestamp,)).fetchall()
    finally:
        # Release the connection even when the query raises; previously a
        # failing query left it open.
        conn.close()
def get_lookups_from_file(file):
    """Read ``(word, context)`` pairs from a plain-text word list.

    The first whitespace-separated token of each line is the word; the
    remaining tokens, re-joined with single spaces, form the context (an
    empty string when the line holds only the word).  Blank or
    whitespace-only lines are skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of ``(word, context)`` tuples in file order.
    """
    lookups = []
    # The context manager closes the handle; the original left it open.
    with open(file, 'r') as src:
        for line in src:
            tokens = line.split()
            if not tokens:
                # A blank line has no word; indexing [0] would raise.
                continue
            lookups.append((tokens[0], ' '.join(tokens[1:])))
    return lookups
def get_last_timestamp_from_lookup(db):
    """Return the largest ``timestamp`` stored in the ``WORDS`` table.

    Args:
        db: Path to the SQLite database file.

    Returns:
        The greatest ``timestamp`` value, or ``None`` when ``WORDS`` has no
        rows.

    Raises:
        sqlite3.Error: If the query fails.
    """
    conn = sqlite3.connect(db)
    try:
        newest = conn.execute(
            'select timestamp from WORDS order by timestamp desc limit 1;'
        ).fetchone()
    finally:
        # Close the connection on the error path as well.
        conn.close()
    return newest[0] if newest is not None else None
def get_last_timestamp():
    """Return the integer stored on the first line of ``TIMESTAMP_PATH``.

    Returns:
        The stored value, or ``0`` when the file cannot be read or its
        first line is not an integer; the error is printed in that case.
    """
    try:
        with open(TIMESTAMP_PATH, 'r') as tfile:
            return int(tfile.readline().strip())
    except (OSError, ValueError) as e:
        # OSError: the file is missing or unreadable (e.g. first run).
        # ValueError: the content is not an integer or cannot be decoded.
        # Anything else is a programming error and should surface.
        print(e)
        return 0
def update_last_timestamp(timestamp):
    """Store *timestamp* in ``TIMESTAMP_PATH``, replacing the previous value.

    Args:
        timestamp: Value to store.  ``None`` (which
            ``get_last_timestamp_from_lookup`` returns for an empty table)
            is ignored so the file never ends up containing the text
            "None", which ``get_last_timestamp`` could not parse.
    """
    if timestamp is None:
        return
    with open(TIMESTAMP_PATH, 'w') as tfile:
        tfile.write('{}'.format(timestamp))
def translate(lingualeo, word):
    """Look up *word* and condense the answer into the fields used for export.

    Args:
        lingualeo: Object exposing ``get_translates(word)``; the result is
            indexed with the keys ``'sound_url'``, ``'transcription'`` and
            ``'translate'`` (a list of items with ``'value'`` and
            ``'pic_url'``).
        word: The word to translate.

    Returns:
        A tuple ``(tr, transcription, sound_url, pic_url)`` where ``tr`` is
        the distinct values among the first three translations joined with
        ``<br>`` and ``pic_url`` is the picture of the first translation
        (``''`` when there are no translations).
    """
    result = lingualeo.get_translates(word)
    translations = result['translate'] or []

    # Drop duplicates among the first three values while keeping their
    # original order; a set would make the order vary between runs.
    values = []
    for item in translations[:3]:
        if item['value'] not in values:
            values.append(item['value'])
    tr = '<br>'.join(values)

    # An empty translation list used to raise IndexError here.
    pic_url = translations[0]['pic_url'] if translations else ''
    return (tr, result['transcription'], result['sound_url'], pic_url)
def extract_filename_from_url(url):
    """Return the final component of the path part of *url*.

    Query string and fragment are not part of the path and are therefore
    excluded; a path ending in ``/`` yields an empty string.
    """
    return os.path.basename(urllib.parse.urlparse(url).path)
@retrying.retry(stop_max_attempt_number=3)
def download_file(url, path=''):
    """Download *url* into directory *path*, keeping the URL's file name.

    Args:
        url: Address of the resource to fetch.
        path: Directory in which the file is stored; the default ``''``
            makes the destination relative to the current directory.

    Returns:
        Whatever ``urllib.request.urlretrieve`` returns; callers in this
        file unpack it as ``(local_filename, _)``.
    """
    filename = extract_filename_from_url(url)
    destination = os.path.join(path, filename)
    return urllib.request.urlretrieve(url, destination)
def write_to_csv(file, data):
    """Write *data* as tab-separated rows.

    Rows are written with the ``unix`` dialect (``\\n`` line endings), a tab
    delimiter, ``|`` as quote character and minimal quoting.

    Args:
        file: Destination path, or an already-open text stream (any object
            with a ``write`` method), which is written to and left open.
        data: Iterable of rows, each an iterable of field values.
    """
    def _write_rows(stream):
        writer = csv.writer(stream, delimiter='\t', dialect='unix',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(data)

    if hasattr(file, 'write'):
        _write_rows(file)
        return

    # Fixed UTF-8 encoding: fields such as phonetic transcriptions contain
    # non-ASCII characters that the platform default encoding may reject.
    with open(file, 'w', newline='', encoding='utf-8') as csvfile:
        _write_rows(csvfile)
def highlight_word_in_context(word, context):
    """Wrap every occurrence of *word* in *context* in a highlight span.

    The word is matched literally (case-sensitive, substrings included), so
    characters that are special in regular expressions, such as ``+``,
    ``.`` or ``?``, are safe.

    Args:
        word: Text to highlight.  An empty word leaves *context* unchanged.
        context: The sentence in which to highlight the word.

    Returns:
        *context* with each occurrence replaced by
        ``<span class=highlight>word</span>``.
    """
    if not word:
        # An empty pattern would match between every character.
        return context
    return context.replace(word,
                           '<span class=highlight>{}</span>'.format(word))
def main():
    """Export looked-up words to a tab-separated file, with media downloads.

    Words come either from a Kindle vocabulary database (``--kindle``; only
    entries newer than the stored timestamp) or from a plain text file
    (``--src``).  Each word is translated through the LinguaLeo service,
    its sound and picture are downloaded when a URL is provided, and one
    row per word is written to the output file.  After a Kindle export the
    stored timestamp is advanced to the newest entry in the database.

    Exits with status 1 when authentication fails or no input is given,
    and with status 0 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--kindle', help='Path to kindle db file (usually vocab.db)')
    parser.add_argument('--src', help='Path to plain text file with newline separated list of words')
    parser.add_argument('-m', '--media-path', help='Where to store media files (sounds/images)')
    parser.add_argument('email', help='LinguaLeo account email/login')
    parser.add_argument('pwd', help='LinguaLeo account password')
    parser.add_argument('-o', '--out', help='Output filename', default='output.csv')
    args = parser.parse_args()

    media_path = args.media_path or ''
    # The result is always written through open(), so an empty -o value
    # falls back to the default file name; the previous sys.stdout fallback
    # could not be opened as a path.
    output = args.out or 'output.csv'

    lingualeo = service.Lingualeo(args.email, args.pwd)
    res = lingualeo.auth()
    if 'error_msg' in res and res['error_msg']:
        print(res['error_msg'])
        sys.exit(1)

    if args.kindle:
        timestamp = get_last_timestamp()
        lookups = get_lookups(args.kindle, timestamp)
    elif args.src:
        lookups = get_lookups_from_file(args.src)
    else:
        print("No input specified")
        sys.exit(1)

    if media_path:
        # Download errors are swallowed below, so a missing target directory
        # would silently produce an export without any media.
        os.makedirs(media_path, exist_ok=True)

    data = []
    total = len(lookups)
    for i, (word, context) in enumerate(lookups):
        progress = int(100.0 * i / total)
        print('[{}%]\ttranslate {}...'.format(progress, word),
              end='', flush=True)
        tr, transcription, sound_url, img_url = translate(lingualeo, word)

        # Reset per word: without this a missing URL left the name unbound
        # (first word) or carrying the previous word's file.
        sound = ''
        img = ''
        if sound_url:
            print('ok, get sound...', end='', flush=True)
            try:
                sound_file, _ = download_file(sound_url, media_path)
                sound = os.path.basename(sound_file)
            except Exception:
                # Best effort: export the word without sound.  Ctrl-C is no
                # longer swallowed as it was by the bare except.
                sound = ''
        if img_url:
            print('ok, get image...', end='', flush=True)
            try:
                img_file, _ = download_file(img_url, media_path)
                img = os.path.basename(img_file)
            except Exception:
                img = ''
        print('ok!')

        if not context:
            context = ''
        # remove all kinds of quotes/backticks as Anki sometimes has troubles
        # with them
        context = re.sub(r'[\'"`]', '', context)

        # Emit the sound tag only when a file was actually downloaded.
        sound_field = '[sound:{}]'.format(sound) if sound else ''
        data.append((word, transcription, sound_field,
                     tr, img, highlight_word_in_context(word, context)))

    if lookups:
        print('[100%]\tWrite to file {}...'.format(output), end='', flush=True)
        write_to_csv(output, data)
        if args.kindle:
            update_last_timestamp(get_last_timestamp_from_lookup(args.kindle))
        print('ok!')
    sys.exit(0)


if __name__ == '__main__':
    main()