Skip to content

Commit

Permalink
nltk/cltk auto-setup provisionally added
Browse files Browse the repository at this point in the history
  • Loading branch information
thePortus committed Mar 4, 2018
1 parent 77d2dd0 commit eb81325
Show file tree
Hide file tree
Showing 9 changed files with 215 additions and 100 deletions.
8 changes: 4 additions & 4 deletions dhelp/files/folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def files(self):
"""
dir_files = []
for folder_item in self.contents:
if os.path.isfile(folder_item):
dir_files.append(folder_item)
if os.path.isfile(os.path.join(self.data, folder_item)):
dir_files.append(os.path.join(self.data, folder_item))
return dir_files

@property
Expand All @@ -73,8 +73,8 @@ def folders(self):
"""
dir_subdirs = []
for folder_item in self.contents:
if os.path.isdir(folder_item):
dir_subdirs.append(folder_item)
if os.path.isdir(os.path.join(self.data, folder_item)):
dir_subdirs.append(os.path.join(self.data, folder_item))
return dir_subdirs

@property
Expand Down
10 changes: 10 additions & 0 deletions dhelp/files/tests/test_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ def test_contents(self):
exempla = self.make_test_obj()
return self.assertTrue(len(exempla.contents) == 5)

def test_files(self):
    # the folder holds 5 items and no sub-folders, so 5 files are expected
    exempla = self.make_test_obj()
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    return self.assertEqual(len(exempla.files), 5)

def test_folders(self):
    # the test folder contains no sub-folders, so the listing must be empty
    folder_obj = self.make_test_obj()
    subfolder_count = len(folder_obj.folders)
    return self.assertEqual(subfolder_count, 0)

def test_length(self):
# should have 5 items in the folder
exempla = self.make_test_obj()
Expand Down
1 change: 1 addition & 0 deletions dhelp/text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
"""

from .basic_text import BasicText
from .install import NLTKInstall, CLTKInstall
179 changes: 179 additions & 0 deletions dhelp/text/install.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#!/usr/bin/python

""" dhelp/text/install.py
David J. Thomas
Contains functions to automatically download necessary nltk packages and
trainer data sets.
"""

import os
from collections import UserString
import nltk


class BaseInstall(UserString):
    """
    Parent class for both NLTK and CLTK installer classes. Not meant to be used
    directly.

    The instance behaves like a string (through UserString) whose value is
    the selected language name.

    Parameters
    ----------
    language : :obj:`str`, optional
        Desired language for install, defaults to english. Any falsy value
        (e.g. None or an empty string) selects the default.

    Raises
    ------
    TypeError
        If argument 'language' is truthy but not a string. TypeError is a
        subclass of Exception, so callers catching Exception still work.

    Example
    -------
    >>> BaseInstall('greek')
    'greek'
    """

    def __init__(self, language=None):
        if not language:
            # nothing usable was supplied, fall back to the default language
            language = 'english'
        elif not isinstance(language, str):
            # isinstance (rather than an exact type check) accepts str
            # subclasses while still rejecting genuine non-strings
            raise TypeError('"language" was not a string.')
        # UserString keeps its wrapped value in the 'data' attribute
        self.data = language


class NLTKInstall(BaseInstall):
    """
    Helper for obtaining NLTK data. Exposes the location of the local NLTK
    data directory and a setup method which opens the NLTK downloader. Use
    this if you have not already downloaded the NLTK trainer data.

    Parameters
    ----------
    language : :obj:`str`, optional
        Desired language for install, defaults to english

    Raises
    ------
    Exception
        If argument 'language' is non-string

    Example
    -------
    >>> from dhelp.text import NLTKInstall
    >>> NLTKInstall('english')
    'english'
    """

    @property
    def data_root(self):
        """
        Builds the path of the nltk data directory inside the current
        user's home directory.

        Returns
        -------
        :obj:`str`
            '~/nltk_data' with the '~' expanded to the user's home directory

        Example
        -------
        >>> NLTKInstall().data_root  # doctest: +SKIP
        """
        unexpanded_root = '~/nltk_data'
        return os.path.expanduser(unexpanded_root)

    def setup(self):
        """
        Calls nltk.download() with no arguments, which hands control to
        NLTK's own interactive downloader.

        Returns
        -------
        :obj:`bool`
            Always True once the nltk.download() call has returned
        """
        nltk.download()
        return True


class CLTKInstall(BaseInstall):
    """
    Helper for obtaining CLTK data. Lists every corpus CLTK offers for the
    chosen language and imports each of them. Use this if you have not
    already downloaded the CLTK trainer data.

    Parameters
    ----------
    language : :obj:`str`, optional
        Desired language for install, defaults to latin

    Raises
    ------
    Exception
        If argument 'language' is non-string

    Example
    -------
    >>> from dhelp.text import CLTKInstall
    >>> CLTKInstall('latin')
    'latin'
    """

    def __init__(self, language=None):
        # override the parent's default language before delegating to it
        if not language:
            language = 'latin'
        super().__init__(language=language)

    @property
    def data_root(self):
        """
        Builds the path of the cltk data directory inside the current
        user's home directory.

        Returns
        -------
        :obj:`str`
            '~/cltk_data' with the '~' expanded to the user's home directory

        Example
        -------
        >>> CLTKInstall('latin').data_root  # doctest: +SKIP
        """
        return os.path.expanduser('~/cltk_data')

    @property
    def corpora_list(self):
        """
        Gets a list of all possible corpora, as reported by CLTK's
        CorpusImporter for this language.

        Returns
        -------
        :obj:`list`
            List of cltk corpora for specified language

        Example
        -------
        >>> CLTKInstall('latin').corpora_list
        ['latin_text_perseus', 'latin_treebank_perseus', 'latin_text_latin_library', 'phi5', 'phi7', 'latin_proper_names_cltk', 'latin_models_cltk', 'latin_pos_lemmata_cltk', 'latin_treebank_index_thomisticus', 'latin_lexica_perseus', 'latin_training_set_sentence_cltk', 'latin_word2vec_cltk', 'latin_text_antique_digiliblt', 'latin_text_corpus_grammaticorum_latinorum', 'latin_text_poeti_ditalia'] # noqa
        """
        # cltk is imported inline so it is only needed when actually used
        from cltk.corpus.utils.importer import CorpusImporter
        return CorpusImporter(self.data).list_corpora

    def setup(self):
        """
        Gets the list of all corpora for this language and imports each one,
        printing progress. A corpus that fails to import is reported and
        skipped so the remaining corpora are still attempted.

        Returns
        -------
        :obj:`bool`
            True once every corpus has been attempted, even if some failed

        Example
        -------
        >>> CLTKInstall('latin').setup()  # doctest: +SKIP
        Downloading latin_text_perseus
        ...
        Finished downloading corpora
        True
        """
        # cltk is imported inline so it is only needed when actually used
        from cltk.corpus.utils.importer import CorpusImporter
        corpus_importer = CorpusImporter(self.data)
        for corpus_name in self.corpora_list:
            print('Downloading', corpus_name)
            try:
                corpus_importer.import_corpus(corpus_name)
            except Exception:
                # best-effort: skip a failing corpus, but do not trap
                # KeyboardInterrupt/SystemExit the way a bare except would,
                # so the user can still abort a long download run
                print('Problem downloading', corpus_name, '(skipping)')
        print('Finished downloading corpora')
        return True
75 changes: 0 additions & 75 deletions dhelp/text/setup.py

This file was deleted.

19 changes: 19 additions & 0 deletions dhelp/text/tests/test_install.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/python

import unittest

from ..install import NLTKInstall, CLTKInstall


class TestNLTKInstall(unittest.TestCase):
    """
    Checks for NLTKInstall that only construct the object and read its
    data_root; setup() is deliberately not called because it launches the
    NLTK downloader.
    """
    test_class = NLTKInstall

    def test_default_language(self):
        # with no argument the installer should fall back to english
        return self.assertEqual(self.test_class().data, 'english')

    def test_explicit_language(self):
        # a supplied language string should be stored unchanged
        return self.assertEqual(self.test_class('german').data, 'german')

    def test_non_string_language(self):
        # a truthy non-string argument should be rejected
        with self.assertRaises(Exception):
            self.test_class(5)

    def test_data_root(self):
        # the data root should point at the nltk_data directory
        data_root = self.test_class().data_root
        return self.assertTrue(data_root.endswith('nltk_data'))


class TestCLTKInstall(unittest.TestCase):
    """
    Checks for CLTKInstall that only construct the object and read its
    data_root; corpora_list and setup() are deliberately not exercised
    because they rely on the cltk package.
    """
    test_class = CLTKInstall

    def test_default_language(self):
        # with no argument the installer should fall back to latin
        return self.assertEqual(self.test_class().data, 'latin')

    def test_explicit_language(self):
        # a supplied language string should be stored unchanged
        return self.assertEqual(self.test_class('greek').data, 'greek')

    def test_non_string_language(self):
        # a truthy non-string argument should be rejected
        with self.assertRaises(Exception):
            self.test_class(5)

    def test_data_root(self):
        # the data root should point at the cltk_data directory
        data_root = self.test_class().data_root
        return self.assertTrue(data_root.endswith('cltk_data'))
12 changes: 0 additions & 12 deletions dhelp/text/tests/test_setup.py

This file was deleted.

6 changes: 0 additions & 6 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,3 @@ universal = 1
[metadata]
license_file = LICENSE
description-file = README.md

[nosetests]
verbosity=1
detailed-errors=1
with-coverage=1
cover-package=nose
5 changes: 2 additions & 3 deletions unittest.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,5 @@ plugins = nose2.plugins.layers

[coverage]
always-on = True
coverage-report =
term-missing
html
coverage-report = term-missing
html

0 comments on commit eb81325

Please sign in to comment.