NormalizeUnicode for filenames and url (fixes #28)

klokantech · Jul 25, 2016 · b21f58d · b21f58d
1 parent 2a9875b
commit b21f58d
Show file tree

Hide file tree

Showing 3 changed files with 122 additions and 4 deletions.
diff --git a/app/jekylledit/controllers/base.py b/app/jekylledit/controllers/base.py
@@ -7,6 +7,7 @@
 from flask import Flask, request, url_for
 from flask.ext.babel import Babel
 from ..ext.mailgun import Mailgun
+from ..ext.normalizeUnicode import normalizeUnicode
 
 
 app = Flask('jekylledit')

diff --git a/app/jekylledit/controllers/site.py b/app/jekylledit/controllers/site.py
@@ -5,7 +5,6 @@
 
 import frontmatter
 import hmac
-from unicodedata import normalize
 
 from flask import abort, json, jsonify, request, render_template
 from flask.ext.cors import cross_origin
@@ -14,7 +13,7 @@
 from pid import PidFile, PidFileAlreadyLockedError
 
 from ..model import Repository, Roles, Sites
-from .base import app, mailgun
+from .base import app, mailgun, normalizeUnicode
 from .auth import authorization_required
 
 
@@ -84,8 +83,8 @@ def site_file(site_id, file_id):
         postData = data['post']
         postIsDraft = False
         title = postData[languages[0]]['metadata']['title']
-        normtitle = normalize('NFKD', title).encode('ascii', 'ignore').decode()
-        slugtitle = normtitle.replace(' ', '-').lower()
+        nu = normalizeUnicode('identifier')
+        slugtitle = nu.code(title).lower()
         for i, language in enumerate(languages):
             langdata = postData[language]
             if not 'permalink' in langdata['metadata']:

diff --git a/app/jekylledit/ext/normalizeUnicode.py b/app/jekylledit/ext/normalizeUnicode.py
@@ -0,0 +1,118 @@
+from unicodedata import normalize, decomposition, combining
+import string
+
+class normalizeUnicode:
+    """Transliterate Unicode text to plain ASCII.
+
+    The behaviour is selected by ``encoding``:
+
+    * ``'humanascii'`` (default) -- ASCII letters, digits, punctuation and
+      whitespace are kept; every other character is transliterated.
+    * ``'identifier'`` -- only ASCII letters, digits and ``-`` are kept; any
+      other ASCII character becomes ``-`` and the result is tidied so it can
+      be used in a file name or URL.
+    * any other value is treated as a codec name: characters the codec can
+      encode are kept unchanged, the rest are transliterated.
+
+    Transliteration tries, in order: the hand-made ``mapping`` table, the
+    Unicode NFKD decomposition with combining marks removed, and finally the
+    hexadecimal code point of the character.
+    """
+
+    # Hand-made table from PloneTool.py
+    # NOTE(review): these look like Windows-1252 positions of S/Z with caron
+    # and Y with diaeresis -- confirm against the Plone original.
+    mapping_custom_1 =  {
+    138: 's', 142: 'z', 154: 's', 158: 'z', 159: 'Y' }
+
+    # UnicodeData.txt does not contain normalization of Greek letters.
+    mapping_greek = {
+    912: 'i', 913: 'A', 914: 'B', 915: 'G', 916: 'D', 917: 'E', 918: 'Z',
+    919: 'I', 920: 'TH', 921: 'I', 922: 'K', 923: 'L', 924: 'M', 925: 'N',
+    926: 'KS', 927: 'O', 928: 'P', 929: 'R', 931: 'S', 932: 'T', 933: 'Y',
+    934: 'F', 936: 'PS', 937: 'O', 938: 'I', 939: 'Y', 940: 'a', 941: 'e',
+    943: 'i', 944: 'y', 945: 'a', 946: 'b', 947: 'g', 948: 'd', 949: 'e',
+    950: 'z', 951: 'i', 952: 'th', 953: 'i', 954: 'k', 955: 'l', 956: 'm',
+    957: 'n', 958: 'ks', 959: 'o', 960: 'p', 961: 'r', 962: 's', 963: 's',
+    964: 't', 965: 'y', 966: 'f', 968: 'ps', 969: 'o', 970: 'i', 971: 'y',
+    972: 'o', 973: 'y' }
+
+    # This may be specific to German...
+    mapping_two_chars = {
+    140 : 'OE', 156: 'oe', 196: 'Ae', 246: 'oe', 252: 'ue', 214: 'Oe',
+    228 : 'ae', 220: 'Ue', 223: 'ss', 230: 'e', 198: 'E' }
+    # Single-character alternative, kept for reference:
+    #140 : 'O', 156: 'o', 196: 'A', 246: 'o', 252: 'u', 214: 'O',
+    #228 : 'a', 220: 'U', 223: 's', 230: 'e', 198: 'E' }
+
+    # 195 is U+00C3 (capital A with tilde) and therefore maps to upper-case 'A'.
+    mapping_latin_chars = {
+    192 : 'A', 193 : 'A', 194 : 'A', 195 : 'A', 197 : 'A', 199 : 'C', 200 : 'E',
+    201 : 'E', 202 : 'E', 203 : 'E', 204 : 'I', 205 : 'I', 206 : 'I', 207 : 'I',
+    208 : 'D', 209 : 'N', 210 : 'O', 211 : 'O', 212 : 'O', 213 : 'O', 215 : 'x',
+    216 : 'O', 217 : 'U', 218 : 'U', 219 : 'U', 221 : 'Y', 224 : 'a', 225 : 'a',
+    226 : 'a', 227 : 'a', 229 : 'a', 231 : 'c', 232 : 'e', 233 : 'e', 234 : 'e',
+    235 : 'e', 236 : 'i', 237 : 'i', 238 : 'i', 239 : 'i', 240 : 'd', 241 : 'n',
+    242 : 'o', 243 : 'o', 244 : 'o', 245 : 'o', 248 : 'o', 249 : 'u', 250 : 'u',
+    251 : 'u', 253 : 'y', 255 : 'y' }
+
+    # Feel free to add a new user-defined mapping. Don't forget to merge your
+    # dict into ``mapping`` below.
+
+    # Code point -> ASCII replacement, merged from all tables above.
+    mapping = {}
+    mapping.update(mapping_custom_1)
+    mapping.update(mapping_greek)
+    mapping.update(mapping_two_chars)
+    mapping.update(mapping_latin_chars)
+
+    # On OpenBSD string.whitespace has a non-standard implementation
+    # See http://plone.org/collector/4704 for details
+    whitespace = ''.join([c for c in string.whitespace if ord(c) < 128])
+    # Characters passed through untouched in 'humanascii' mode.
+    allowed = string.ascii_letters + string.digits + string.punctuation + whitespace
+    # Characters passed through untouched in 'identifier' mode.
+    allowedid = string.ascii_letters + string.digits + '-'
+
+    # Default mode; overridden per instance by __init__.
+    encoding = 'humanascii'
+
+    def __init__ (self, encoding='humanascii'):
+        """Select the mode: 'humanascii', 'identifier' or a codec name."""
+        self.encoding = encoding
+
+    def _transliterate(self, ch, keep):
+        """Return the ASCII replacement for a character the codec rejected.
+
+        ``keep`` is the set of characters allowed to survive from an NFKD
+        decomposition (``allowedid`` in identifier mode, else ``allowed``).
+        """
+        ordinal = ord(ch)
+        if ordinal in self.mapping:
+            # The custom tables win over the Unicode decomposition.
+            return self.mapping[ordinal]
+        decomposed = normalize('NFKD', ch)
+        if decomposition(ch) or len(decomposed) > 1:
+            # The decomposition may yield several characters and may contain
+            # non-letters: drop combining marks and anything not allowed.
+            return ''.join(
+                c for c in decomposed if not combining(c) and c in keep)
+        # Hex string instead of an unknown char.
+        return "%x" % ordinal
+
+    def code (self, text):
+        """
+        Normalize unicode characters in ``text`` to the base ASCII letters.
+
+        ``text`` may be a ``str`` or UTF-8 encoded bytes; the result has the
+        same type as the input (bytes results are UTF-8 encoded). Case is
+        preserved.
+
+        In 'identifier' mode every ASCII character other than a letter, a
+        digit or '-' is turned into '-', runs of '-' are collapsed to one,
+        leading and trailing '-' are removed, and a single '-' is prepended
+        when the result does not start with an ASCII letter. An input with
+        nothing usable yields an empty result.
+
+        Raises ``LookupError`` if ``encoding`` names an unknown codec and
+        ``UnicodeDecodeError`` if bytes input is not valid UTF-8.
+        """
+        bytesinput = not isinstance(text, str)
+        if bytesinput:
+            # Python 3 has no ``unicode`` builtin; decode bytes-like input.
+            text = str(text, 'utf-8')
+        if not text:
+            return b'' if bytesinput else ''
+
+        identifier = self.encoding == 'identifier'
+        builtin_mode = identifier or self.encoding == 'humanascii'
+        enc = 'ascii' if builtin_mode else self.encoding
+        keep = self.allowedid if identifier else self.allowed
+
+        parts = []
+        for ch in text:
+            if builtin_mode and ch in keep:
+                # ASCII chars, digits etc. stay untouched
+                parts.append(ch)
+                continue
+            try:
+                ch.encode(enc, 'strict')
+            except UnicodeEncodeError:
+                parts.append(self._transliterate(ch, keep))
+            else:
+                # Encodable but not explicitly allowed: a separator in
+                # identifier mode, passed through otherwise.
+                parts.append('-' if identifier else ch)
+
+        res = ''.join(parts)
+        if identifier:
+            # Strip outer dashes and collapse dash runs of any length.
+            res = '-'.join(piece for piece in res.split('-') if piece)
+            if res and res[0] not in string.ascii_letters:
+                res = '-' + res
+        if bytesinput:
+            return res.encode('utf-8')
+        return res