Skip to content

Commit

Permalink
Force 'utf-8' encoding without relying on platform-dependent default
Browse files Browse the repository at this point in the history
On Windows, the default text encoding is locale-dependent (commonly 'cp1252'), so opening the UTF-8 files without an explicit encoding raises a UnicodeDecodeError.

Fix fastai#5
  • Loading branch information
albertvillanova committed Jul 13, 2019
1 parent 85e5052 commit 3d11564
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions nlputils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_wiki(path,lang):
shutil.rmtree(path/'text')


def split_wiki(path,lang):
def split_wiki(path,lang,encoding='utf-8'):
dest = path/'docs'
name = f'{lang}wiki'
if dest.exists():
Expand All @@ -35,7 +35,7 @@ def split_wiki(path,lang):

dest.mkdir(exist_ok=True, parents=True)
title_re = re.compile(rf'<doc id="\d+" url="https://{lang}.wikipedia.org/wiki\?curid=\d+" title="([^"]+)">')
lines = (path/name).open()
lines = (path/name).open(encoding=encoding)
f=None

for i,l in enumerate(lines):
Expand All @@ -44,7 +44,7 @@ def split_wiki(path,lang):
title = title_re.findall(l)[0].replace('/','_')
if len(title)>150: continue
if f: f.close()
f = (dest/f'{title}.txt').open('w')
f = (dest/f'{title}.txt').open('w', encoding=encoding)
else: f.write(l)
f.close()
return dest
Expand Down

0 comments on commit 3d11564

Please sign in to comment.