Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Swap gzip with bzip2 #15

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
include wordninja_words.txt.gz
include wordninja_words.txt.bz2
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,16 @@ Custom Language Models

```
>>> lm = wordninja.LanguageModel('my_lang.txt.gz')
>>> lm = wordninja.LanguageModel('my_lang.txt.bz2') # bzip2 alternative
>>> lm.split('derek')
['der','ek']
```

Language files must be gziped text files with one word per line in decreasing order of probability.
Language files must be gzipped or bzip2-compressed text files with one word per line in decreasing order of probability.

If you want to make your model the default, set:

```
wordninja.DEFAULT_LANGUAGE_MODEL = wordninja.LanguageModel('my_lang.txt.gz')
wordninja.DEFAULT_LANGUAGE_MODEL = wordninja.LanguageModel('my_lang.txt.bz2')
```
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
author_email='[email protected]',
packages = ['wordninja'],
url='https://github.com/keredson/wordninja',
package_data={'wordninja': ['wordninja_words.txt.gz']},
package_data={'wordninja': ['wordninja_words.txt.bz2']},
package_dir={'wordninja': 'wordninja'},
include_package_data=True,
py_modules=['wordninja'],
Expand Down
7 changes: 5 additions & 2 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@ def test_apostrophes(self):
self.assertEqual(list(wordninja.split("that'sthesheriff'sbadge")), ["that's","the","sheriff's","badge"])

def test_custom_model(self):
lm = wordninja.LanguageModel('test_lang.txt.gz')
self.assertEqual(list(lm.split('derek')), ['der','ek'])
lm_bzip = wordninja.LanguageModel('test_lang.txt.bz2')
self.assertEqual(list(lm_bzip.split('derek')), ['der','ek'])

lm_gzip = wordninja.LanguageModel('test_lang.txt.gz')
self.assertEqual(list(lm_gzip.split('derek')), ['der','ek'])

if __name__ == '__main__':
unittest.main()
Expand Down
Binary file added test_lang.txt.bz2
Binary file not shown.
Binary file modified test_lang.txt.gz
Binary file not shown.
28 changes: 21 additions & 7 deletions wordninja.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import gzip, os, re
import bz2, gzip, os, re
from math import log


__version__ = '2.0.0'
__version__ = '2.1.0'


# I did not author this code, only tweaked it from:
Expand All @@ -25,11 +25,23 @@
# <list of contractions>


class FileTypeMagicBytesRe:
    """Compiled byte patterns for the leading signature of each supported compression format.

    Each attribute is a bytes regex anchored at the start of the data, meant
    to be matched against the first bytes of a word-list file.
    """

    # bzip2 signature: the three bytes 0x42 0x5a 0x68 (ASCII "BZh").
    BZIP_FILE = re.compile(b'^\\x42\\x5a\\x68')

    # gzip signature: the three bytes 0x1f 0x8b 0x08.
    GZIP_FILE = re.compile(b'^\\x1f\\x8b\\x08')


class LanguageModel(object):
def __init__(self, word_file):
# Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
with gzip.open(word_file) as f:
words = f.read().decode().split()
if check_magic(word_file, FileTypeMagicBytesRe.BZIP_FILE):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since we have the filename, why not just check the extension?

Copy link
Author

@eric-mh eric-mh Dec 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just in case there's any discrepancy between the file extension and the file's actual type. That said, it really depends on what the expected behavior is when it's fed bad input.

Should it:

  1. Trust the extension and just raise an error if the file fails to process?
  2. Ignore the extension and just process the file as best it can?
  3. Ignore the extension, but emit a warning if the extension doesn't match?

with bz2.open(word_file) as f:
words = f.read().decode().split()
elif check_magic(word_file, FileTypeMagicBytesRe.GZIP_FILE):
with gzip.open(word_file) as f:
words = f.read().decode().split()
else:
raise ValueError(f"Could not detect compression type of {word_file}. Is it gzip or bzip2?")

self._wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))
self._maxword = max(len(x) for x in words)

Expand Down Expand Up @@ -77,10 +89,12 @@ def best_match(i):

return reversed(out)

DEFAULT_LANGUAGE_MODEL = LanguageModel(os.path.join(os.path.dirname(os.path.abspath(__file__)),'wordninja','wordninja_words.txt.gz'))
def check_magic(word_file, magic, header_size=16):
    """Match a signature pattern against the leading bytes of a file.

    Only the first ``header_size`` bytes are read, so a large word list is
    not pulled into memory just to inspect its signature.

    Args:
        word_file: Path of the file to inspect.
        magic: Compiled bytes pattern (any object with a ``match`` method)
            describing the expected leading bytes.
        header_size: Number of leading bytes to read. It must be at least as
            long as the longest signature being tested.

    Returns:
        Whatever ``magic.match`` returns for the header: a match object when
        the file starts with the signature, otherwise ``None``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(word_file, 'rb') as f:
        # A bounded read is enough: signatures live at the very start.
        header = f.read(header_size)
    return magic.match(header)

DEFAULT_LANGUAGE_MODEL = LanguageModel(os.path.join(os.path.dirname(os.path.abspath(__file__)),'wordninja','wordninja_words.txt.bz2'))
_SPLIT_RE = re.compile("[^a-zA-Z0-9']+")

def split(s):
return DEFAULT_LANGUAGE_MODEL.split(s)


Binary file added wordninja/wordninja_words.txt.bz2
Binary file not shown.
Binary file removed wordninja/wordninja_words.txt.gz
Binary file not shown.