keredson · eric-mh · Nov 18, 2020 · Nov 18, 2020 · Nov 18, 2020 · Nov 19, 2020
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1 +1 @@
-include wordninja_words.txt.gz
+include wordninja_words.txt.bz2
diff --git a/README.md b/README.md
@@ -54,14 +54,16 @@ Custom Language Models
 
 ```
 >>> lm = wordninja.LanguageModel('my_lang.txt.gz')
+>>> lm = wordninja.LanguageModel('my_lang.txt.bz2')  # bzip2 alternative
 >>> lm.split('derek')
 ['der','ek']
 ```
 
-Language files must be gziped text files with one word per line in decreasing order of probability.
+Language files must be gzipped or bzip2-compressed text files with one word per line in decreasing order of probability.
 
 If you want to make your model the default, set:
 
 ```
 wordninja.DEFAULT_LANGUAGE_MODEL = wordninja.LanguageModel('my_lang.txt.gz')
+wordninja.DEFAULT_LANGUAGE_MODEL = wordninja.LanguageModel('my_lang.txt.bz2')  # bzip2 alternative
 ```
diff --git a/setup.py b/setup.py
@@ -14,7 +14,7 @@
   author_email='[email protected]',
   packages = ['wordninja'],
   url='https://github.com/keredson/wordninja',
-  package_data={'wordninja': ['wordninja_words.txt.gz']},
+  package_data={'wordninja': ['wordninja_words.txt.bz2']},
   package_dir={'wordninja': 'wordninja'},
   include_package_data=True,
   py_modules=['wordninja'],

diff --git a/test.py b/test.py
@@ -23,8 +23,11 @@ def test_apostrophes(self):
     self.assertEqual(list(wordninja.split("that'sthesheriff'sbadge")), ["that's","the","sheriff's","badge"])
 
   def test_custom_model(self):
-    lm = wordninja.LanguageModel('test_lang.txt.gz')
-    self.assertEqual(list(lm.split('derek')), ['der','ek'])
+    lm_bzip = wordninja.LanguageModel('test_lang.txt.bz2')
+    self.assertEqual(list(lm_bzip.split('derek')), ['der','ek'])
+
+    lm_gzip = wordninja.LanguageModel('test_lang.txt.gz')
+    self.assertEqual(list(lm_gzip.split('derek')), ['der','ek'])
 
 if __name__ == '__main__':
     unittest.main()

diff --git a/test_lang.txt.bz2 b/test_lang.txt.bz2
diff --git a/test_lang.txt.gz b/test_lang.txt.gz
diff --git a/wordninja.py b/wordninja.py
@@ -1,8 +1,8 @@
-import gzip, os, re
+import bz2, gzip, os, re
 from math import log
 
 
-__version__ = '2.0.0'
+__version__ = '2.1.0'
 
 
 # I did not author this code, only tweaked it from:
@@ -25,11 +25,23 @@
 #   <list of contractions>
 
 
+class FileTypeMagicBytesRe():
+  BZIP_FILE = re.compile(b'^\\x42\\x5a\\x68')
+  GZIP_FILE = re.compile(b'^\\x1f\\x8b\\x08')
+
+
 class LanguageModel(object):
   def __init__(self, word_file):
     # Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
-    with gzip.open(word_file) as f:
-      words = f.read().decode().split()
+    if check_magic(word_file, FileTypeMagicBytesRe.BZIP_FILE):
+      with bz2.open(word_file) as f:
+        words = f.read().decode().split()
+    elif check_magic(word_file, FileTypeMagicBytesRe.GZIP_FILE):
+      with gzip.open(word_file) as f:
+        words = f.read().decode().split()
+    else:
+      raise ValueError(f"Could not detect compression type of {word_file}. Is it gzip or bzip2?")
+
     self._wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))
     self._maxword = max(len(x) for x in words)
 
@@ -77,10 +89,12 @@ def best_match(i):
 
     return reversed(out)
 
-DEFAULT_LANGUAGE_MODEL = LanguageModel(os.path.join(os.path.dirname(os.path.abspath(__file__)),'wordninja','wordninja_words.txt.gz'))
+def check_magic(word_file, magic):
+  with open(word_file, 'rb') as f:
+    return magic.match(f.read())
+
+DEFAULT_LANGUAGE_MODEL = LanguageModel(os.path.join(os.path.dirname(os.path.abspath(__file__)),'wordninja','wordninja_words.txt.bz2'))
 _SPLIT_RE = re.compile("[^a-zA-Z0-9']+")
 
 def split(s):
   return DEFAULT_LANGUAGE_MODEL.split(s)
-
-
diff --git a/wordninja/wordninja_words.txt.bz2 b/wordninja/wordninja_words.txt.bz2
diff --git a/wordninja/wordninja_words.txt.gz b/wordninja/wordninja_words.txt.gz
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		include wordninja_words.txt.gz
		include wordninja_words.txt.bz2