From d786954d9ef94256e09c4012dcff979bdedb7d94 Mon Sep 17 00:00:00 2001 From: David Thomas Date: Wed, 23 May 2018 13:41:10 -0400 Subject: [PATCH] setup for nltk/cltk fixed --- dhelp/text/_bases.py | 9 ++++----- dhelp/text/cltk.py | 16 ++++++++-------- dhelp/text/nltk.py | 17 ++++++----------- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/dhelp/text/_bases.py b/dhelp/text/_bases.py index 6f3387c..8c891d9 100644 --- a/dhelp/text/_bases.py +++ b/dhelp/text/_bases.py @@ -21,14 +21,13 @@ class BaseText(UserString): >>> print(text) 'Lorem ipsum dolor sit amet...' """ # noqa - options = {} + options = { + 'encoding': 'utf-8', + 'language': 'english' + } def __init__(self, text, *args, **kwargs): super().__init__(str) - self.options = { - 'encoding': 'utf-8', - 'language': 'english' - } # update .options if options keyword arg passed if 'options' in kwargs: if type(kwargs['options']) == dict: diff --git a/dhelp/text/cltk.py b/dhelp/text/cltk.py index db0de7a..ac89358 100644 --- a/dhelp/text/cltk.py +++ b/dhelp/text/cltk.py @@ -256,10 +256,10 @@ class LatinText(CLTKMixin, BaseText): >>> print(text.lemmatize()) gallia edo1 omne divido in pars tres """ - - def __init__(self, text, options={}): - options['language'] = 'latin' - super().__init__(text=text, options=options) + options = { + 'encoding': 'utf-8', + 'language': 'latin' + } def macronize(self, mode='tag_ngram_123_backoff'): """Adds macrons (long vowel marks). @@ -366,10 +366,10 @@ class AncientGreekText(CLTKMixin, BaseText): >>> print(text.lemmatize()) εἰμί δὲ σύμπας οὗτος τὰ σύγγραμμα ἐκεῖνος μάλιστα οὐ ὠφέλιμος , ὅστις ὡς πρὸς οἶδα συγγράφω. """ # noqa - - def __init__(self, text, options={}): - options['language'] = 'greek' - super().__init__(text=text, options=options) + options = { + 'encoding': 'utf-8', + 'language': 'greek' + } def normalize(self): """Fixes problems with differences in greek accent encoding. diff --git a/dhelp/text/nltk.py b/dhelp/text/nltk.py index d13d7bc..d55f0e6 100644 --- a/dhelp/text/nltk.py +++ b/dhelp/text/nltk.py @@ -26,7 +26,7 @@ class NLTKMixin: """ @classmethod - def setup(self, lang_pkgs_info): + def setup(cls): """Download NLTK packages and trainer corpora. Launches the NLTK package download interface. Method is invoked by @@ -42,7 +42,7 @@ def setup(self, lang_pkgs_info): # (1) pkg name (2) list of path segs where pkg data is stored locally pkgs_and_path_segments = settings.NLTK_PACKAGES['all'] # join common list with language specific packages - for package_info in lang_pkgs_info: + for package_info in settings.NLTK_PACKAGES[cls.options['language']]: pkgs_and_path_segments.append(package_info) # loop through list of tuples, each with pkg name and path info for package, package_path_segments in pkgs_and_path_segments: @@ -269,12 +269,7 @@ class EnglishText(NLTKMixin, BaseText): >>> english_text.rm_lines().rm_nonchars().rm_spaces() The quick brown fox jumped over the lazy dog """ # noqa - - def __init__(self, text, options={}): - options['language'] = 'english' - super().__init__(text=text, options=options) - - @classmethod - def setup(self): - # invoke parent setup method, sending it the pkg info for specific lang - super(self.__class__).setup(settings.NLTK_PACKAGES['english']) + options = { + 'encoding': 'utf-8', + 'language': 'english' + }