Added support for converting Turkish characters to similar English characters &…

… updated code quality, thanks to @SalihKARAHAN for his contribution
r0oth3x49 · Dec 24, 2017 · c35e66f · c35e66f
1 parent a5c8ed9
commit c35e66f
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 42 deletions.
diff --git a/udemy/_extractor.py b/udemy/_extractor.py
@@ -30,6 +30,7 @@
     _search_simple_regex,
     unescapeHTML,
     )
+from ._sanitize import sanitize_title
 early_py_version = sys.version_info[:2] < (2, 7)
 
 class Session:
@@ -65,48 +66,8 @@ def match_id(self, url):
         return course_name
 
     def _sanitize_title(self, title):
-        # Spanish vowels characters to english vowels character
-        _temp   = ''.join([str(ord(i)) if ord(i) > 128 else i for i in title])
-        if '225' in _temp:
-            _temp = _temp.replace('225', 'a')
-        if '233' in _temp:
-            _temp = _temp.replace('233', 'e')
-        if '237' in _temp:
-            _temp = _temp.replace('237', 'i')
-        if '243' in _temp:
-            _temp = _temp.replace('243', 'o')
-        if '250' in _temp:
-            _temp = _temp.replace('250', 'u')
-        if '252' in _temp:
-            _temp = _temp.replace('252', 'u')
-        if '241' in _temp:
-            _temp = _temp.replace('241', 'n')
-        if '191' in _temp:
-            _temp = _temp.replace('191', '')
-
-        if '193' in _temp:
-            _temp = _temp.replace('193', 'A')
-        if '201' in _temp:
-            _temp = _temp.replace('201', 'E')
-        if '205' in _temp:
-            _temp = _temp.replace('205', 'I')
-        if '211' in _temp:
-            _temp = _temp.replace('211', 'O')
-        if '218' in _temp:
-            _temp = _temp.replace('218', 'U')
-        if '220' in _temp:
-            _temp = _temp.replace('220', 'U')
-        if '209' in _temp:
-            _temp = _temp.replace('209', 'N')
-
-        ok = re.compile(r'[^/]')
-        if os.name == "nt":
-            ok = re.compile(r'[^\\/:.*?"<>|,]')
-
-        _title      = ''.join(x if ok.match(x) else "_" for x in _temp)
-        __title     = re.sub('\d+', '', _title)
-
-        return __title
+        """Return ``title`` as cleaned up by the imported ``sanitize_title``."""
+        return sanitize_title(title)
 
     def _get_csrf_token(self, webpage):
         try:

diff --git a/udemy/_sanitize.py b/udemy/_sanitize.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+from ._compat import (
+                        re, 
+                        os
+                    )
+
+
+def sanitize_title(title):
+    """Return `title` reduced to ASCII and safe to use as a file name.
+
+    Known Turkish and Spanish letters become the closest English letter, any
+    other non-ASCII character becomes its decimal code point, and characters
+    that are not allowed in file names (more of them on Windows) become "_".
+
+    Characters are translated one at a time.  Substituting decimal codes in
+    the already-joined string would also rewrite digits that belong to the
+    title itself, e.g. "Top 225 tips" would come out as "Top a tips".
+    """
+    _locale = {
+    # Turkish Characters to English
+        # Capital chars
+                194: 'A', 199: 'C', 286: 'G', 304: 'I',
+                206: 'I', 214: 'O', 350: 'S', 219: 'U',
+        # Small chars
+                226: 'a', 231: 'c', 287: 'g', 305: 'i',
+                238: 'i', 246: 'o', 351: 's', 251: 'u',
+    # Spanish Characters to English
+        # Small chars (191, the inverted question mark, is dropped)
+                191: '', 225: 'a', 233: 'e', 237: 'i',
+                243: 'o', 250: 'u', 252: 'u', 241: 'n',
+        # Capital chars
+                193: 'A', 201: 'E', 205: 'I', 211: 'O',
+                218: 'U', 220: 'U', 209: 'N',
+    }
+
+    # 168 is the spacing diaeresis; directly in front of u/U it is dropped so
+    # that only the plain letter remains.
+    _diaeresis = 168
+
+    _pieces = []
+    for _index, _char in enumerate(title):
+        _code = ord(_char)
+        if _code < 128:
+            # Plain ASCII, digits included, is copied through untouched.
+            _pieces.append(_char)
+        elif _code == _diaeresis and title[_index + 1:_index + 2] in ('u', 'U'):
+            continue
+        else:
+            # Unmapped characters fall back to their decimal code point so
+            # that the result stays ASCII-only.
+            _pieces.append(_locale.get(_code, str(_code)))
+    _temp = ''.join(_pieces)
+
+    # "/" is replaced everywhere; on Windows more characters are replaced.
+    ok = re.compile(r'[^/]')
+    if os.name == "nt":
+        ok = re.compile(r'[^\\/:.*?"<>|,]')
+
+    _title      = ''.join(x if ok.match(x) else "_" for x in _temp)
+
+    return _title