Skip to content
This repository has been archived by the owner on Aug 1, 2021. It is now read-only.

Commit

Permalink
Added support for Turkish chars conversion to similar English chars &…
Browse files Browse the repository at this point in the history
… updated code quality, thanks to @SalihKARAHAN for his contribution
  • Loading branch information
r0oth3x49 committed Dec 24, 2017
1 parent a5c8ed9 commit c35e66f
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 42 deletions.
45 changes: 3 additions & 42 deletions udemy/_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
_search_simple_regex,
unescapeHTML,
)
from ._sanitize import sanitize_title
# True when the running interpreter is older than Python 2.7.
early_py_version = sys.version_info[:2] < (2, 7)

class Session:
Expand Down Expand Up @@ -65,48 +66,8 @@ def match_id(self, url):
return course_name

def _sanitize_title(self, title):
    """Return a sanitized form of *title*.

    Delegates to ``sanitize_title`` (imported from ``._sanitize`` at the top
    of this module) so there is a single implementation of the
    accented-character mapping and path-character filtering.

    The previous inline implementation is intentionally gone: it encoded
    non-ASCII characters as decimal strings, substring-replaced them, and
    then stripped every run of digits, which also removed genuine digits
    from titles.
    """
    return sanitize_title(title)

def _get_csrf_token(self, webpage):
try:
Expand Down
64 changes: 64 additions & 0 deletions udemy/_sanitize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
from ._compat import (
re,
os
)


# Code point -> ASCII replacement for the accented characters this module
# knows how to transliterate.  Keys are integer code points so that lookups
# happen per character and can never collide with digits in the title.
_LOCALE_MAP = {
    # Turkish characters to English
    # Capital chars
    194: 'A',
    199: 'C',
    286: 'G',
    304: 'I',
    206: 'I',
    214: 'O',
    350: 'S',
    219: 'U',
    # Small chars
    226: 'a',
    231: 'c',
    287: 'g',
    305: 'i',
    238: 'i',
    246: 'o',
    351: 's',
    251: 'u',
    # Spanish characters to English
    # Small chars
    191: '',
    225: 'a',
    233: 'e',
    237: 'i',
    243: 'o',
    250: 'u',
    252: 'u',
    241: 'n',
    # Capital chars
    193: 'A',
    201: 'E',
    205: 'I',
    211: 'O',
    218: 'U',
    220: 'U',
    209: 'N',
}

# A spacing diaeresis (code point 168) directly followed by "u"/"U" is
# dropped, leaving just the letter.
_DIAERESIS = 168
_LATIN_U = (85, 117)  # ord('U'), ord('u')


def sanitize_title(title):
    """Return *title* transliterated and stripped of path-hostile characters.

    Characters with a code point above 128 are handled one at a time:

    * characters listed in ``_LOCALE_MAP`` become their ASCII look-alike;
    * a spacing diaeresis immediately followed by ``u``/``U`` is dropped;
    * any other such character is emitted as its decimal code point, which
      is what earlier versions of this function produced.

    Working per character (instead of substring-replacing over a
    digit-encoded copy of the whole title) keeps genuine digits intact, so
    ``"Python 225 tips"`` is no longer turned into ``"Python a tips"``.

    Finally ``/`` is replaced by ``_``; when ``os.name == "nt"`` the
    characters ``\\ / : . * ? " < > | ,`` are all replaced by ``_``.

    :param title: text to sanitize (expected to be a text string).
    :returns: the sanitized title.
    """
    pieces = []
    last_index = len(title) - 1
    for index, char in enumerate(title):
        code = ord(char)
        if code <= 128:
            pieces.append(char)
        elif (code == _DIAERESIS and index < last_index
                and ord(title[index + 1]) in _LATIN_U):
            # Drop the diaeresis; the following u/U is appended on the next
            # iteration.  Compared via ord() to avoid any implicit
            # bytes/unicode coercion on Python 2.
            continue
        else:
            # Unmapped characters keep the historical decimal fallback.
            pieces.append(_LOCALE_MAP.get(code, str(code)))
    transliterated = ''.join(pieces)

    forbidden = r'[\\/:.*?"<>|,]' if os.name == "nt" else r'/'
    return re.sub(forbidden, '_', transliterated)

0 comments on commit c35e66f

Please sign in to comment.