Skip to content

Commit

Permalink
Merge branch 'master' of github.com:thePortus/dhelp
Browse files Browse the repository at this point in the history
  • Loading branch information
thePortus committed May 24, 2018
2 parents 3555317 + f08575d commit 3d26104
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 27 deletions.
21 changes: 17 additions & 4 deletions dhelp/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
]

NLTK_PACKAGES = {
'english': [
'all': [
('punkt', ['tokenizers', 'punkt.zip']),
('verbnet', ['corpora', 'verbnet.zip']),
('wordnet', ['corpora', 'wordnet.zip']),
('words', ['corpora', 'words.zip']),
('large_grammars', ['grammars', 'large_grammars.zip']),
('large_grammars', ['grammars', 'large_grammars.zip']),
(
'averaged_perceptron_tagger',
Expand All @@ -24,12 +25,24 @@
'maxent_treebank_pos_tagger',
['taggers', 'maxent_treebank_pos_tagger.zip']
),
('universal_tagset', ['taggers', 'universal_tagset.zip']),
('punkt', ['tokenizers', 'punkt.zip']),
('maxent_ne_chunker', ['chunkers', 'maxent_ne_chunker.zip']),
('universal_tagset', ['taggers', 'universal_tagset.zip']),
],
'english': [
('words', ['corpora', 'words.zip']),
('sample_grammars', ['grammars', 'sample_grammars.zip']),
('book_grammars', ['grammars', 'book_grammars.zip']),
('perluniprops', ['misc', 'perluniprops.zip'])
],
'spanish': [
('spanish_grammars', ['grammars', 'spanish_grammars.zip'])
],
'basque': [
('basque_grammars', ['grammars', 'basque_grammars.zip'])
]
}

# TODO: Change CLTK setup so it expects path segments like NLTK settings
CLTK_PACKAGES = {
'greek': [
('greek_software_tlgu', 'software/greek_software_tlgu'),
Expand Down
15 changes: 9 additions & 6 deletions dhelp/text/_bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,18 @@ class BaseText(UserString):
>>> print(text)
'Lorem ipsum dolor sit amet...'
""" # noqa
options = {
'encoding': 'utf-8',
'language': 'english'
}

def __init__(self, text, options={}):
def __init__(self, text, *args, **kwargs):
super().__init__(str)
if 'encoding' not in options:
options['encoding'] = 'utf-8'
if 'language' not in options:
options['language'] = 'english'
# update .options if options keyword arg passed
if 'options' in kwargs:
if type(kwargs['options']) == dict:
self.options.update(kwargs['options'])
self.data = text
self.options = options

def __enter__(self):
pass
Expand Down
16 changes: 8 additions & 8 deletions dhelp/text/cltk.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,10 @@ class LatinText(CLTKMixin, BaseText):
>>> print(text.lemmatize())
gallia edo1 omne divido in pars tres
"""

def __init__(self, text, options={}):
options['language'] = 'latin'
super().__init__(text=text, options=options)
options = {
'encoding': 'utf-8',
'language': 'latin'
}

def macronize(self, mode='tag_ngram_123_backoff'):
"""Adds macrons (long vowel marks).
Expand Down Expand Up @@ -366,10 +366,10 @@ class AncientGreekText(CLTKMixin, BaseText):
>>> print(text.lemmatize())
εἰμί δὲ σύμπας οὗτος τὰ σύγγραμμα ἐκεῖνος μάλιστα οὐ ὠφέλιμος , ὅστις ὡς πρὸς οἶδα συγγράφω.
""" # noqa

def __init__(self, text, options={}):
options['language'] = 'greek'
super().__init__(text=text, options=options)
options = {
'encoding': 'utf-8',
'language': 'greek'
}

def normalize(self):
"""Fixes problems with differences in greek accent encoding.
Expand Down
25 changes: 16 additions & 9 deletions dhelp/text/nltk.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,27 @@ class NLTKMixin:
"""

@classmethod
def setup(self):
def setup(cls):
"""Download NLTK packages and trainer corpora.
Launches the NLTK package download interface. Overridden by the CLTK
Launches the NLTK package download interface. Method is invoked by
child .setup() methods in NLTK classes. Method is overridden in CLTK
child classes to launch the automated CLTK downloader. Convenience
method if user has not already downloaded NLTK packages and trainer
sets.
Example:
>>> EnglishText.setup()
"""
for package, package_path_segments in settings.NLTK_PACKAGES[
'english'
]:
# start with common pkgs, a list of tuples each with...
# (1) pkg name (2) list of path segs where pkg data is stored locally
pkgs_and_path_segments = settings.NLTK_PACKAGES['all']
# join common list with language specific packages
for package_info in settings.NLTK_PACKAGES[cls.options['language']]:
pkgs_and_path_segments.append(package_info)
# loop through list of tuples, each with pkg name and path info
for package, package_path_segments in pkgs_and_path_segments:
# build the relative filepath to the data, specific to the os
package_path = os.sep.join(package_path_segments)
# will trigger an error if the file is missing; if the file is found, do nothing
try:
Expand Down Expand Up @@ -262,7 +269,7 @@ class EnglishText(NLTKMixin, BaseText):
>>> english_text.rm_lines().rm_nonchars().rm_spaces()
The quick brown fox jumped over the lazy dog
""" # noqa

def __init__(self, text, options={}):
options['language'] = 'english'
super().__init__(text=text, options=options)
options = {
'encoding': 'utf-8',
'language': 'english'
}

0 comments on commit 3d26104

Please sign in to comment.