Skip to content

Commit

Permalink
Add all offline code
Browse files Browse the repository at this point in the history
  • Loading branch information
catus felis committed Oct 6, 2015
1 parent 06e84a0 commit cd6091d
Show file tree
Hide file tree
Showing 56 changed files with 2,202,085 additions and 0 deletions.
Binary file added bin/MSVCR71.dll
Binary file not shown.
1 change: 1 addition & 0 deletions bin/createhtml.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
d:\anaconda\python tab.py -utf TudienAnhVietBeta.txt
2 changes: 2 additions & 0 deletions bin/createmobi.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
copy /y TudienAnhVietBeta-org.opf TudienAnhVietBeta.opf
mobigen.exe -unicode -s0 TudienAnhVietBeta.opf
Binary file added bin/mobigen.exe
Binary file not shown.
Binary file added bin/python25.dll
Binary file not shown.
Binary file added bin/tab2opf.exe
Binary file not shown.
317 changes: 317 additions & 0 deletions bin/tab2opf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Script for conversion of Stardict tabfile (<header>\t<definition>
# per line) into the OPF file for MobiPocket Dictionary
#
# For usage of dictionary convert it by:
# (wine) mobigen.exe DICTIONARY.opf
#
# MobiPocket Reader at: www.mobipocket.com for platforms:
# PalmOs, Windows Mobile, Symbian (Series 60, Series 80, 90, UIQ), Psion, Blackberry, Franklin, iLiad (by iRex), BenQ-Siemens, Pepper Pad..
# http://www.mobipocket.com/en/DownloadSoft/DownloadManualInstall.asp
# mobigen.exe available at:
# http://www.mobipocket.com/soft/prcgen/mobigen.zip
#
# Copyright (C) 2007 - Klokan Petr Pridal (www.klokan.cz)
#
#
# Version history:
# 0.1 (19.7.2007) Initial version
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Library General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Library General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.

# VERSION
VERSION = "0.1"

# FILENAME is a first parameter on the commandline now

import sys
import re
import os

from unicodedata import normalize, decomposition, combining
import string
from exceptions import UnicodeEncodeError

from pattern.en import conjugate, lemma, lexeme

# Hand-made table from PloneTool.py
mapping_custom_1 = {
138: 's', 142: 'z', 154: 's', 158: 'z', 159: 'Y' }

# UnicodeData.txt does not contain normalization of Greek letters.
mapping_greek = {
912: 'i', 913: 'A', 914: 'B', 915: 'G', 916: 'D', 917: 'E', 918: 'Z',
919: 'I', 920: 'TH', 921: 'I', 922: 'K', 923: 'L', 924: 'M', 925: 'N',
926: 'KS', 927: 'O', 928: 'P', 929: 'R', 931: 'S', 932: 'T', 933: 'Y',
934: 'F', 936: 'PS', 937: 'O', 938: 'I', 939: 'Y', 940: 'a', 941: 'e',
943: 'i', 944: 'y', 945: 'a', 946: 'b', 947: 'g', 948: 'd', 949: 'e',
950: 'z', 951: 'i', 952: 'th', 953: 'i', 954: 'k', 955: 'l', 956: 'm',
957: 'n', 958: 'ks', 959: 'o', 960: 'p', 961: 'r', 962: 's', 963: 's',
964: 't', 965: 'y', 966: 'f', 968: 'ps', 969: 'o', 970: 'i', 971: 'y',
972: 'o', 973: 'y' }

# This may be specific to German...
mapping_two_chars = {
140 : 'O', 156: 'o', 196: 'A', 246: 'o', 252: 'u', 214: 'O',
228 : 'a', 220: 'U', 223: 's', 230: 'e', 198: 'E' }

mapping_latin_chars = {
192 : 'A', 193 : 'A', 194 : 'A', 195 : 'a', 197 : 'A', 199 : 'C', 200 : 'E',
201 : 'E', 202 : 'E', 203 : 'E', 204 : 'I', 205 : 'I', 206 : 'I', 207 : 'I',
208 : 'D', 209 : 'N', 210 : 'O', 211 : 'O', 212 : 'O', 213 : 'O', 215 : 'x',
216 : 'O', 217 : 'U', 218 : 'U', 219 : 'U', 221 : 'Y', 224 : 'a', 225 : 'a',
226 : 'a', 227 : 'a', 229 : 'a', 231 : 'c', 232 : 'e', 233 : 'e', 234 : 'e',
235 : 'e', 236 : 'i', 237 : 'i', 238 : 'i', 239 : 'i', 240 : 'd', 241 : 'n',
242 : 'o', 243 : 'o', 244 : 'o', 245 : 'o', 248 : 'o', 249 : 'u', 250 : 'u',
251 : 'u', 253 : 'y', 255 : 'y' }

# Feel free to add new user-defined mapping. Don't forget to update mapping dict
# with your dict.

mapping = {}
mapping.update(mapping_custom_1)
mapping.update(mapping_greek)
mapping.update(mapping_two_chars)
mapping.update(mapping_latin_chars)

# On OpenBSD string.whitespace has a non-standard implementation
# See http://plone.org/collector/4704 for details
whitespace = ''.join([c for c in string.whitespace if ord(c) < 128])
allowed = string.ascii_letters + string.digits + string.punctuation + whitespace

def normalizeUnicode(text, encoding='humanascii'):
"""
This method is used for normalization of unicode characters to the base ASCII
letters. Output is ASCII encoded string (or char) with only ASCII letters,
digits, punctuation and whitespace characters. Case is preserved.
"""
unicodeinput = True
if not isinstance(text, unicode):
text = unicode(text, 'utf-8')
unicodeinput = False

res = ''
global allowed
if encoding == 'humanascii':
enc = 'ascii'
else:
enc = encoding
for ch in text:
if (encoding == 'humanascii') and (ch in allowed):
# ASCII chars, digits etc. stay untouched
res += ch
continue
else:
try:
ch.encode(enc,'strict')
res += ch
except UnicodeEncodeError:
ordinal = ord(ch)
if mapping.has_key(ordinal):
# try to apply custom mappings
res += mapping.get(ordinal)
elif decomposition(ch) or len(normalize('NFKD',ch)) > 1:
normalized = filter(lambda i: not combining(i), normalize('NFKD', ch)).strip()
# normalized string may contain non-letter chars too. Remove them
# normalized string may result to more than one char
res += ''.join([c for c in normalized if c in allowed])
else:
# hex string instead of unknown char
res += "%x" % ordinal
if unicodeinput:
return res
else:
return res.encode('utf-8')

OPFTEMPLATEHEAD1 = """<?xml version="1.0"?><!DOCTYPE package SYSTEM "oeb1.ent">
<!-- the command line instruction 'prcgen dictionary.opf' will produce the dictionary.prc file in the same folder-->
<!-- the command line instruction 'mobigen dictionary.opf' will produce the dictionary.mobi file in the same folder-->
<package unique-identifier="uid" xmlns:dc="Dublin Core">
<metadata>
<dc-metadata>
<dc:Identifier id="uid">%s</dc:Identifier>
<!-- Title of the document -->
<dc:Title><h2>%s</h2></dc:Title>
<dc:Language>EN</dc:Language>
</dc-metadata>
<x-metadata>
"""
OPFTEMPLATEHEADNOUTF = """ <output encoding="Windows-1252" flatten-dynamic-dir="yes"/>"""
OPFTEMPLATEHEAD2 = """
<DictionaryInLanguage>en-us</DictionaryInLanguage>
<DictionaryOutLanguage>en-us</DictionaryOutLanguage>
</x-metadata>
</metadata>
<!-- list of all the files needed to produce the .prc file -->
<manifest>
"""

OPFTEMPLATELINE = """ <item id="dictionary%d" href="%s%d.html" media-type="text/x-oeb1-document"/>
"""

OPFTEMPLATEMIDDLE = """</manifest>
<!-- list of the html files in the correct order -->
<spine>
"""

OPFTEMPLATELINEREF = """ <itemref idref="dictionary%d"/>
"""

OPFTEMPLATEEND = """</spine>
<tours/>
<guide> <reference type="search" title="Dictionary Search" onclick= "index_search()"/> </guide>
</package>
"""

################################################################
# MAIN
################################################################

UTFINDEX = False
if len(sys.argv) > 1:
FILENAME = sys.argv[1]
if sys.argv[1] == '-utf':
UTFINDEX = True
FILENAME = sys.argv[2]
else:
FILENAME = sys.argv[1]
else:
print "tab2opf (Stardict->MobiPocket)"
print "------------------------------"
print "Version: %s" % VERSION
print "Copyright (C) 2007 - Klokan Petr Pridal"
print
print "Usage: python tab2opf.py [-utf] DICTIONARY.tab"
print
print "ERROR: You have to specify a .tab file"
sys.exit(1)

fr = open(FILENAME,'rb')
name = os.path.splitext(os.path.basename(FILENAME))[0]

from sets import Set
#from nltk.corpus import words
from pattern.en import pluralize

# List of common English words
#wordlist = Set(words.words())
wordlist = Set(open("misc/354984si.ngl").read().split())

i = 0
to = False
splitlimit = 10000000
maxcount = splitlimit # 200 #
count = 0
removed = 0
for r in fr.xreadlines():
count += 1

if count > maxcount:
break

if i % splitlimit == 0:
if to:
to.write("""
</mbp:frameset>
</body>
</html>
""")
to.close()
to = open("%s%d.html" % (name, i / splitlimit), 'w')

to.write("""<?xml version="1.0" encoding="utf-8"?>
<html xmlns:idx="www.mobipocket.com" xmlns:mbp="www.mobipocket.com" xmlns:xlink="http://www.w3.org/1999/xlink">
<body>
<mbp:pagebreak/>
<mbp:frameset>
<mbp:slave-frame display="bottom" device="all" breadth="auto" leftmargin="0" rightmargin="0" bottommargin="0" topmargin="0">
<div align="center" bgcolor="yellow"/>
<a onclick="index_search()">Dictionary Search</a>
</div>
</mbp:slave-frame>
<mbp:pagebreak/>
""")

dt, dd = r.split('\t',1)
if not UTFINDEX:
dt = normalizeUnicode(dt,'cp1252')
dd = normalizeUnicode(dd,'cp1252')
dtstrip = normalizeUnicode( dt ).strip()
dd = dd.replace("\\\\","\\").replace("\\n","<br/>\n")
forms = Set(lexeme(dt))
forms.add(pluralize(dt))

toremove = Set()
for w in forms:
if w not in wordlist:
# print("Remove %s" % w)
toremove.add(w)
removed += len(toremove)

forms.difference_update(toremove)

inflections = ''
if len(forms):
inflections = '\t\t\t<idx:infl>\n'
for f in forms:
inflections += '\t\t\t\t<idx:iform value="%s"/>\n' % f
inflections += '\t\t\t</idx:infl>\n'

#print(inflections.encode())
#inflections = inflections.encode('utf-8')

to.write(""" <idx:entry name="word" scriptable="yes">
<h2>
<idx:orth>%s\n%s</idx:orth><idx:key key="%s">
</h2>
%s
</idx:entry>
<mbp:pagebreak/>
""" % (dt, str(inflections), dtstrip, dd))
#print dt
i += 1

to.write("""
</mbp:frameset>
</body>
</html>
""")
to.close()
fr.close()
lineno = i - 1

print("Removed %i" % removed)

to = open("%s.opf" % name, 'w')
to.write(OPFTEMPLATEHEAD1 % (name, name))
if not UTFINDEX:
to.write(OPFTEMPLATEHEADNOUTF)
to.write(OPFTEMPLATEHEAD2)
for i in range(0,(lineno/splitlimit)+1):
to.write(OPFTEMPLATELINE % (i, name, i))
to.write(OPFTEMPLATEMIDDLE)
for i in range(0,(lineno/splitlimit)+1):
to.write(OPFTEMPLATELINEREF % i)
to.write(OPFTEMPLATEEND)
Binary file added bin/w9xpopen.exe
Binary file not shown.
37 changes: 37 additions & 0 deletions dict/TudienAnhVietBeta-org.opf
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?xml version="1.0" encoding="utf-8"?>

<!-- the command line instruction 'prcgen dictionary.opf' will produce the dictionary.prc file in the same folder-->
<!-- the command line instruction 'mobigen dictionary.opf' will produce the dictionary.mobi file in the same folder-->

<package unique-identifier="uid" xmlns:dc="tdanhviet">

<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
<dc:Identifier id="uid">tdanhviet</dc:Identifier>
<dc:Title>Từ điển Anh-Việt β</dc:Title>
<dc:Creator>catusf</dc:Creator>
<dc:Date>10/06/2015</dc:Date>
<dc:Copyrights></dc:Copyrights>
<dc:Publisher></dc:Publisher>
<dc:Subject></dc:Subject>
<dc:Language>vi</dc:Language></dc-metadata>
<x-metadata>
<DictionaryInLanguage>en-us</DictionaryInLanguage>
<DictionaryOutLanguage>vi-vn</DictionaryOutLanguage>
</x-metadata>
</metadata>

<!-- list of all the files needed to produce the .prc file -->
<manifest>
<item id="dictionary0" href="TudienAnhVietBeta0.html" media-type="text/x-oeb1-document"/>
</manifest>


<!-- list of the html files in the correct order -->
<spine>
<itemref idref="dictionary0"/>
</spine>

<tours/>
<guide> <reference type="search" title="Dictionary Search" onclick= "index_search()"/> </guide>
</package>
Loading

0 comments on commit cd6091d

Please sign in to comment.