Add all offline code

catusf · Oct 6, 2015 · cd6091d · cd6091d
1 parent 06e84a0
commit cd6091d
Show file tree

Hide file tree

Showing 56 changed files with 2,202,085 additions and 0 deletions.
diff --git a/bin/MSVCR71.dll b/bin/MSVCR71.dll
diff --git a/bin/createhtml.bat b/bin/createhtml.bat
@@ -0,0 +1 @@
+d:\anaconda\python tab.py -utf TudienAnhVietBeta.txt
diff --git a/bin/createmobi.bat b/bin/createmobi.bat
@@ -0,0 +1,2 @@
+copy /y TudienAnhVietBeta-org.opf TudienAnhVietBeta.opf
+mobigen.exe -unicode -s0 TudienAnhVietBeta.opf
diff --git a/bin/mobigen.exe b/bin/mobigen.exe
diff --git a/bin/python25.dll b/bin/python25.dll
diff --git a/bin/tab2opf.exe b/bin/tab2opf.exe
diff --git a/bin/tab2opf.py b/bin/tab2opf.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Script for conversion of Stardict tabfile (<header>\t<definition>
+# per line) into the OPF file for MobiPocket Dictionary
+#
+# For usage of dictionary convert it by:
+# (wine) mobigen.exe DICTIONARY.opf
+#
+# MobiPocket Reader at: www.mobipocket.com for platforms:
+#   PalmOs, Windows Mobile, Symbian (Series 60, Series 80, 90, UIQ), Psion, Blackberry, Franklin, iLiad (by iRex), BenQ-Siemens, Pepper Pad..
+#   http://www.mobipocket.com/en/DownloadSoft/DownloadManualInstall.asp
+# mobigen.exe available at:
+#   http://www.mobipocket.com/soft/prcgen/mobigen.zip
+#
+# Copyright (C) 2007 - Klokan Petr Pridal (www.klokan.cz)
+#
+#
+# Version history:
+# 0.1 (19.7.2007) Initial version
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Library General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Library General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# VERSION
+VERSION = "0.1"
+
+# FILENAME is a first parameter on the commandline now
+
+import sys
+import re
+import os
+
+from unicodedata import normalize, decomposition, combining
+import string
+from exceptions import UnicodeEncodeError
+
+from pattern.en import conjugate, lemma, lexeme
+
+# Hand-made table from PloneTool.py
+mapping_custom_1 =  {
+138: 's', 142: 'z', 154: 's', 158: 'z', 159: 'Y' }
+
+# UnicodeData.txt does not contain normalization of Greek letters.
+mapping_greek = {
+912: 'i', 913: 'A', 914: 'B', 915: 'G', 916: 'D', 917: 'E', 918: 'Z',
+919: 'I', 920: 'TH', 921: 'I', 922: 'K', 923: 'L', 924: 'M', 925: 'N',
+926: 'KS', 927: 'O', 928: 'P', 929: 'R', 931: 'S', 932: 'T', 933: 'Y',
+934: 'F', 936: 'PS', 937: 'O', 938: 'I', 939: 'Y', 940: 'a', 941: 'e',
+943: 'i', 944: 'y', 945: 'a', 946: 'b', 947: 'g', 948: 'd', 949: 'e',
+950: 'z', 951: 'i', 952: 'th', 953: 'i', 954: 'k', 955: 'l', 956: 'm',
+957: 'n', 958: 'ks', 959: 'o', 960: 'p', 961: 'r', 962: 's', 963: 's',
+964: 't', 965: 'y', 966: 'f', 968: 'ps', 969: 'o', 970: 'i', 971: 'y',
+972: 'o', 973: 'y' }
+
+# This may be specific to German...
+mapping_two_chars = {
+140 : 'O', 156: 'o', 196: 'A', 246: 'o', 252: 'u', 214: 'O',
+228 : 'a', 220: 'U', 223: 's', 230: 'e', 198: 'E' }
+
+mapping_latin_chars = {
+192 : 'A', 193 : 'A', 194 : 'A', 195 : 'a', 197 : 'A', 199 : 'C', 200 : 'E',
+201 : 'E', 202 : 'E', 203 : 'E', 204 : 'I', 205 : 'I', 206 : 'I', 207 : 'I',
+208 : 'D', 209 : 'N', 210 : 'O', 211 : 'O', 212 : 'O', 213 : 'O', 215 : 'x',
+216 : 'O', 217 : 'U', 218 : 'U', 219 : 'U', 221 : 'Y', 224 : 'a', 225 : 'a',
+226 : 'a', 227 : 'a', 229 : 'a', 231 : 'c', 232 : 'e', 233 : 'e', 234 : 'e',
+235 : 'e', 236 : 'i', 237 : 'i', 238 : 'i', 239 : 'i', 240 : 'd', 241 : 'n',
+242 : 'o', 243 : 'o', 244 : 'o', 245 : 'o', 248 : 'o', 249 : 'u', 250 : 'u',
+251 : 'u', 253 : 'y', 255 : 'y' }
+
+# Feel free to add new user-defined mapping. Don't forget to update mapping dict
+# with your dict.
+
+mapping = {}
+mapping.update(mapping_custom_1)
+mapping.update(mapping_greek)
+mapping.update(mapping_two_chars)
+mapping.update(mapping_latin_chars)
+
+# On OpenBSD string.whitespace has a non-standard implementation
+# See http://plone.org/collector/4704 for details
+whitespace = ''.join([c for c in string.whitespace if ord(c) < 128])
+allowed = string.ascii_letters + string.digits + string.punctuation + whitespace
+
+def normalizeUnicode(text, encoding='humanascii'):
+    """
+    This method is used for normalization of unicode characters to the base ASCII
+    letters. Output is ASCII encoded string (or char) with only ASCII letters,
+    digits, punctuation and whitespace characters. Case is preserved.
+    """
+    unicodeinput = True
+    if not isinstance(text, unicode):
+        text = unicode(text, 'utf-8')
+        unicodeinput = False
+
+    res = ''
+    global allowed
+    if encoding == 'humanascii':
+        enc = 'ascii'
+    else:
+        enc = encoding
+    for ch in text:
+        if (encoding == 'humanascii') and (ch in allowed):
+            # ASCII chars, digits etc. stay untouched
+            res += ch
+            continue
+        else:
+            try:
+                ch.encode(enc,'strict')
+                res += ch
+            except UnicodeEncodeError:
+                ordinal = ord(ch)
+                if mapping.has_key(ordinal):
+                    # try to apply custom mappings
+                    res += mapping.get(ordinal)
+                elif decomposition(ch) or len(normalize('NFKD',ch)) > 1:
+                    normalized = filter(lambda i: not combining(i), normalize('NFKD', ch)).strip()
+                    # normalized string may contain non-letter chars too. Remove them
+                    # normalized string may result to  more than one char
+                    res += ''.join([c for c in normalized if c in allowed])
+                else:
+                    # hex string instead of unknown char
+                    res += "%x" % ordinal
+    if unicodeinput:
+        return res
+    else:
+        return res.encode('utf-8')
+
+OPFTEMPLATEHEAD1 = """<?xml version="1.0"?><!DOCTYPE package SYSTEM "oeb1.ent">
+
+<!-- the command line instruction 'prcgen dictionary.opf' will produce the dictionary.prc file in the same folder-->
+<!-- the command line instruction 'mobigen dictionary.opf' will produce the dictionary.mobi file in the same folder-->
+
+<package unique-identifier="uid" xmlns:dc="Dublin Core">
+
+<metadata>
+	<dc-metadata>
+		<dc:Identifier id="uid">%s</dc:Identifier>
+		<!-- Title of the document -->
+		<dc:Title><h2>%s</h2></dc:Title>
+		<dc:Language>EN</dc:Language>
+	</dc-metadata>
+	<x-metadata>
+"""
+OPFTEMPLATEHEADNOUTF = """		<output encoding="Windows-1252" flatten-dynamic-dir="yes"/>"""
+OPFTEMPLATEHEAD2 = """
+		<DictionaryInLanguage>en-us</DictionaryInLanguage>
+		<DictionaryOutLanguage>en-us</DictionaryOutLanguage>
+	</x-metadata>
+</metadata>
+
+<!-- list of all the files needed to produce the .prc file -->
+<manifest>
+"""
+
+OPFTEMPLATELINE = """ <item id="dictionary%d" href="%s%d.html" media-type="text/x-oeb1-document"/>
+"""
+
+OPFTEMPLATEMIDDLE = """</manifest>
+
+
+<!-- list of the html files in the correct order  -->
+<spine>
+"""
+
+OPFTEMPLATELINEREF = """	<itemref idref="dictionary%d"/>
+"""
+
+OPFTEMPLATEEND = """</spine>
+
+<tours/>
+<guide> <reference type="search" title="Dictionary Search" onclick= "index_search()"/> </guide>
+</package>
+"""
+
+################################################################
+# MAIN
+################################################################
+
+UTFINDEX = False
+if len(sys.argv) > 1:
+    FILENAME = sys.argv[1]
+    if sys.argv[1] == '-utf':
+        UTFINDEX = True
+        FILENAME = sys.argv[2]
+    else:
+        FILENAME = sys.argv[1]
+else:
+    print "tab2opf (Stardict->MobiPocket)"
+    print "------------------------------"
+    print "Version: %s" % VERSION
+    print "Copyright (C) 2007 - Klokan Petr Pridal"
+    print
+    print "Usage: python tab2opf.py [-utf] DICTIONARY.tab"
+    print
+    print "ERROR: You have to specify a .tab file"
+    sys.exit(1)
+
+fr = open(FILENAME,'rb')
+name = os.path.splitext(os.path.basename(FILENAME))[0]
+
+from sets import Set
+#from nltk.corpus import words
+from pattern.en import pluralize
+
+# List of common English words
+#wordlist = Set(words.words())
+wordlist = Set(open("misc/354984si.ngl").read().split())
+
+i = 0
+to = False
+splitlimit = 10000000
+maxcount = splitlimit # 200 # 
+count = 0
+removed = 0
+for r in fr.xreadlines():
+    count += 1
+
+    if count > maxcount:
+        break
+
+    if i % splitlimit == 0:
+        if to:
+            to.write("""
+                </mbp:frameset>
+              </body>
+            </html>
+            """)
+            to.close()
+        to = open("%s%d.html" % (name, i / splitlimit), 'w')
+
+        to.write("""<?xml version="1.0" encoding="utf-8"?>
+<html xmlns:idx="www.mobipocket.com" xmlns:mbp="www.mobipocket.com" xmlns:xlink="http://www.w3.org/1999/xlink">
+  <body>
+    <mbp:pagebreak/>
+    <mbp:frameset>
+      <mbp:slave-frame display="bottom" device="all" breadth="auto" leftmargin="0" rightmargin="0" bottommargin="0" topmargin="0">
+        <div align="center" bgcolor="yellow"/>
+        <a onclick="index_search()">Dictionary Search</a>
+        </div>
+      </mbp:slave-frame>
+      <mbp:pagebreak/>
+""")
+
+    dt, dd =  r.split('\t',1)
+    if not UTFINDEX:
+        dt = normalizeUnicode(dt,'cp1252')
+        dd = normalizeUnicode(dd,'cp1252')
+    dtstrip = normalizeUnicode( dt ).strip()
+    dd = dd.replace("\\\\","\\").replace("\\n","<br/>\n")
+    forms = Set(lexeme(dt))
+    forms.add(pluralize(dt))
+
+    toremove = Set()
+    for w in forms:
+        if w not in wordlist:
+#            print("Remove %s" % w)            
+            toremove.add(w)
+    removed += len(toremove)
+
+    forms.difference_update(toremove)        
+
+    inflections = ''
+    if len(forms):
+        inflections = '\t\t\t<idx:infl>\n'
+        for f in forms:
+            inflections += '\t\t\t\t<idx:iform value="%s"/>\n' % f
+        inflections += '\t\t\t</idx:infl>\n'
+
+    #print(inflections.encode())
+    #inflections = inflections.encode('utf-8')
+
+    to.write("""      <idx:entry name="word" scriptable="yes">
+        <h2>
+          <idx:orth>%s\n%s</idx:orth><idx:key key="%s">
+        </h2>
+        %s
+      </idx:entry>
+      <mbp:pagebreak/>
+""" % (dt, str(inflections), dtstrip, dd))
+    #print dt
+    i += 1
+
+to.write("""
+    </mbp:frameset>
+  </body>
+</html>
+""")
+to.close()
+fr.close()
+lineno = i - 1
+
+print("Removed %i" % removed)
+
+to = open("%s.opf" % name, 'w')
+to.write(OPFTEMPLATEHEAD1 % (name, name))
+if not UTFINDEX:
+    to.write(OPFTEMPLATEHEADNOUTF)
+to.write(OPFTEMPLATEHEAD2)
+for i in range(0,(lineno/splitlimit)+1):
+    to.write(OPFTEMPLATELINE % (i, name, i))
+to.write(OPFTEMPLATEMIDDLE)
+for i in range(0,(lineno/splitlimit)+1):
+    to.write(OPFTEMPLATELINEREF % i)
+to.write(OPFTEMPLATEEND)
diff --git a/bin/w9xpopen.exe b/bin/w9xpopen.exe
diff --git a/dict/TudienAnhVietBeta-org.opf b/dict/TudienAnhVietBeta-org.opf
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="utf-8"?>
+
+<!-- the command line instruction 'prcgen dictionary.opf' will produce the dictionary.prc file in the same folder-->
+<!-- the command line instruction 'mobigen dictionary.opf' will produce the dictionary.mobi file in the same folder-->
+
+<package unique-identifier="uid" xmlns:dc="tdanhviet">
+
+	<metadata>
+		<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
+			<dc:Identifier id="uid">tdanhviet</dc:Identifier>
+			<dc:Title>Từ điển Anh-Việt β</dc:Title>
+			<dc:Creator>catusf</dc:Creator>
+			<dc:Date>10/06/2015</dc:Date>
+			<dc:Copyrights></dc:Copyrights>
+			<dc:Publisher></dc:Publisher>
+			<dc:Subject></dc:Subject>
+		<dc:Language>vi</dc:Language></dc-metadata>
+        <x-metadata>
+            <DictionaryInLanguage>en-us</DictionaryInLanguage>
+            <DictionaryOutLanguage>vi-vn</DictionaryOutLanguage>
+        </x-metadata>
+	</metadata>
+
+<!-- list of all the files needed to produce the .prc file -->
+<manifest>
+ <item id="dictionary0" href="TudienAnhVietBeta0.html" media-type="text/x-oeb1-document"/>
+</manifest>
+
+
+<!-- list of the html files in the correct order  -->
+<spine>
+	<itemref idref="dictionary0"/>
+</spine>
+
+<tours/>
+<guide> <reference type="search" title="Dictionary Search" onclick= "index_search()"/> </guide>
+</package>
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		d:\anaconda\python tab.py -utf TudienAnhVietBeta.txt
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		copy /y TudienAnhVietBeta-org.opf TudienAnhVietBeta.opf
		mobigen.exe -unicode -s0 TudienAnhVietBeta.opf