diff --git a/.travis.yml b/.travis.yml index f8ddcf1..5d64bc9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ os: linux dist: trusty # set environment variables env: - - PACKAGE_VERSION=0.0.2 + - PACKAGE_VERSION=0.0.4 # install dependencies install: - pip install -r requirements/dev.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ee031d..550bc4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [0.0.4] + +* Changes + * Web module reorganized into main package + ## [0.0.3] * Changes diff --git a/README.md b/README.md index e0aea97..9b6b2ef 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,6 @@ --- [![PyPI version](https://badge.fury.io/py/dhelp.svg)](https://badge.fury.io/py/dhelp) -![PyPI - License](https://img.shields.io/pypi/l/Django.svg) [![Build Status](https://travis-ci.org/thePortus/dhelp.svg?branch=master)](https://travis-ci.org/thePortus/dhelp) [![Coverage Status](https://coveralls.io/repos/github/thePortus/dhelp/badge.svg?branch=master)](https://coveralls.io/github/thePortus/dhelp?branch=master) [![Documentation Status](https://readthedocs.org/projects/dhelp/badge/?version=latest)](http://dhelp.readthedocs.io/en/latest/?badge=latest) [![Code Health](https://landscape.io/github/thePortus/dhelp/master/landscape.svg?style=flat)](https://landscape.io/github/thePortus/dhelp/master) [![Total GitHub downloads](https://img.shields.io/github/downloads/thePortus/dhelp/total.svg)](https://img.shields.io/github/downloads/thePortus/dhelp/total.svg) [![Waffle.io - Columns and their card count](https://badge.waffle.io/thePortus/dhelp.svg?columns=all)](https://waffle.io/thePortus/dhelp) @@ -82,7 +81,7 @@ The first time you use a language-specific text object, you need to run its .set ```sh from dhelp import 
EnglishText -EnglishText('').setup() +EnglishText.setup() ``` @@ -103,6 +102,15 @@ single line of code. from dhelp import TextFile +# quickest method to modify a file, start by making a TextFile object... +txt_file = TextFile('some/file.txt') +# then use with/as syntax to give you the file contents in strings form +with txt_file as txt_data: + # txt_data is contents, whatever you put in txt_file.save_data is saved + txt_file.save_data = txt_data.replace('\n', '') + +# Other methods... + # load file data as a string and print to screen text_file = TextFile('some/file.txt') text_file.load() @@ -146,6 +154,22 @@ TextFile('some/other-file.txt').remove() from dhelp import TextFolder +# quickest way to modify a folder, start by making a TextFolder object +text_folder = TextFolder('some/path') +# use with/as syntax to get a list of TextFile objects, then loop through +with text_folder as txt_files: + for txt_file in txt_files: + # use with/as syntax on file to get contents + with txt_file as txt_data: + # whatever you store in .save_data will be saved to file + txt_file.save_data = txt_data.replace('\n', '') + +``` + +**Other Methods** + +```python + # returns a list of TextFile objects, each connected to a file in the folder folder_files = TextFolder('some/folder').text_files # You can loop through and load, edit, save, et.c. 
the TextFiles as normal @@ -291,7 +315,7 @@ Before you use this object for any of the methods below you need to download tra ```python from dhelp import EnglishText -EnglishText('').setup() +EnglishText.setup() ``` **Examples** diff --git a/TODO.md b/TODO.md index d9955c7..cbec6a1 100644 --- a/TODO.md +++ b/TODO.md @@ -2,7 +2,7 @@ # General or Current TODO Items -* Revise and add to README.md, provide function-level documentation +* Add with/as context manager to TextFile, TextFolder, and CSVFile # TODO Items by Module diff --git a/dhelp/__init__.py b/dhelp/__init__.py index f7d2e35..f5a3434 100644 --- a/dhelp/__init__.py +++ b/dhelp/__init__.py @@ -11,10 +11,6 @@ manipulations, and even text analysis. """ -from .files.csv_file import CSVFile -from .files.text_file import TextFile -from .files.text_folder import TextFolder -from .web.web_page import WebPage -from .text.english import EnglishText -from .text.latin import LatinText -from .text.ancient_greek import AncientGreekText +from .files import TextFile, TextFolder, CSVFile +from .web import WebPage +from .text import EnglishText, LatinText, AncientGreekText diff --git a/dhelp/files/__init__.py b/dhelp/files/__init__.py index 013e4b7..e33480e 100644 --- a/dhelp/files/__init__.py +++ b/dhelp/files/__init__.py @@ -1 +1,4 @@ #!/usr/bin/python + +from .txt import TextFile, TextFolder +from .csv import CSVFile diff --git a/dhelp/files/_bases.py b/dhelp/files/_bases.py new file mode 100644 index 0000000..ea0e45b --- /dev/null +++ b/dhelp/files/_bases.py @@ -0,0 +1,528 @@ +#!/usr/bin/python + +import os +import errno +import shutil +from collections import UserString, deque + + +class BasePath(UserString): + """ + Used to interact with a system path in various ways. Not generally meant to + be used directly, BasePath is parent to various Folder and File classes. 
+
+    Args:
+        path (:obj:`str`) System path pointing to desired text file location
+
+    Raises:
+        Exception: If a non-string arg is sent as path
+    """
+    options = {}
+
+    def __init__(self, path=None, *args, **kwargs):
+        # call parent class constructor and set to a string
+        super().__init__(str)
+        # raise error if path not string or set path to current dir if not set
+        if path and type(path) is not str:
+            raise Exception('path is not a string')
+        elif not path:
+            path = os.getcwd()
+        # or if relative path sent, convert to absolute path
+        elif not os.path.isabs(path):
+            path = os.path.abspath(os.path.join(os.getcwd(), path))
+        # set default options
+        self.options = {
+            'silent': False,
+            'overwrite': False,
+            'encoding': 'utf-8',
+            'newline': '',
+            'readlines': False,
+            'delimiter': ',',
+            'dialect': 'excel',
+            'extensions': ['txt']
+        }
+        # update .options if options keyword arg passed
+        if 'options' in kwargs:
+            if type(kwargs['options']) == dict:
+                self.options.update(kwargs['options'])
+        # store path as string
+        self.data = path
+
+    def __enter__(self):
+        return self.load()
+
+    def __exit__(self, ctx_type, ctx_value, ctx_traceback):
+        options = self.options
+        options['overwrite'] = True
+        # write over previous file data
+        if type(getattr(self, 'save_data', None)) == str:
+            return self.save(self.save_data, options=options)
+
+    @property
+    def exists(self):
+        """Check if anything exists at the current path.
+
+        Returns:
+            :obj:`bool` True if anything exists at path, False if not
+
+        Example:
+            >>> BasePath('some/extant/path').exists()
+            True
+            >>> BasePath('some/non-extant/path').exists()
+            False
+
+        """
+        return os.path.exists(self.data)
+
+    @property
+    def size(self):
+        """Get file/folder size of anything at the current path.
+
+        Returns the size of any item at the specified path in bytes, returns
+        0 if non-extant.
+ + Returns: + :obj:`int` Size of item at path, in bytes + + Example: + >>> BasePath('some/path.txt') + 121 + """ + # return zero if nothing present + if not self.exists: + return 0 + return os.path.getsize(self.data) + + @property + def basename(self): + """Get file/folder name of current path. + + Returns the basename (last element of path) of the current path + e.g. the name of the current file or folder. + + Returns: + :obj:`str` Name of current file or folder + + Example: + >>> BasePath('some/path.txt') + 'path.txt' + """ + return os.path.basename(self.data) + + @property + def dirname(self): + """Get parent directory path. + + Returns the absolute path of the parent directory of the current path. + + Returns: :obj:`str` Name of parent directory of current path + + Example: + >>> BasePath('some/path.txt').dirname() + '/absolute/path/to/some' + + """ + return os.path.dirname(self.data) + + @property + def is_dir(self): + """Check if path is a directory. + + Returns true if path points to existing directory. + + Returns: :obj:`bool` True if path points to directory, False if not + + Examples: + >>> BasePath('some/path').is_dir() + True + >>> BasePath('some/path.txt').is_dir() + False + """ + return os.path.isdir(self.data) + + @property + def is_file(self): + """Check if path is a file. + + Returns true if path points to existing file. + + Returns: + :obj:`bool` True if path points to file, False if not + + Examples: + >>> BasePath('some/path.txt').is_file() + True + >>> BasePath('some/path').is_file() + False + """ + return os.path.isfile(self.data) + + @property + def is_link(self): + """Check if path is a link. + + Returns true if path points to symbolic link. 
+ + Returns: + :obj:`bool` True if path points to symbolic link, False if not + + Examples: + >>> BasePath('some/link.txt').is_link() + True + >>> BasePath('nota/link').is_link() + False + """ + if not self.exists: + return False + return os.path.islink(self.data) + + def copy(self, destination, *args, **kwargs): + """Copy data at path to another location. + + Copies the contents at system path (if a folder, copies it's contents + recursively) to a specified destination. Returns a new version of the + object linked to the new location. Will raise an error if anything + exists at the destination unless overwrite option is flagged. + + Args: + destination (:obj:`str`) System path to which you want to copy item(s) at current path + options (:obj:`dict`, optional) Options settings found at respective keywords + + Returns: + :obj:`self.__class__` New instance of object tied to the copied path + + Raises: + Exception: If a problem is encountered when copying + + Example: + >>> BasePath('some/path').copy('some/other-path') + 'some/other-path' + """ # noqa + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) + # ensure is an absolute path + if not os.path.isabs(destination): + destination = os.path.abspath(destination) + # if destination already exists and overwrite option not set, abort + if os.path.exists(destination) and not options['overwrite']: + raise Exception('Cannot copy, item exists at ' + str(destination)) + # attempt to copy location recursively + try: + if self.is_file: + shutil.copy(self.data, destination) + else: + shutil.copytree(self.data, destination) + # raise exception msg if error encountered + except: + raise Exception( + 'Error copying. 
Source:', + self.data, + 'Destination', + destination + ) + # return new version of object that is linked to copied location + return self.__class__(destination) + + def remove(self, *args, **kwargs): + """Delete item(s) at current path. + + Deletes any item at the current path. If a folder deletes contents + recursively. Returns True if successful. + + Returns: + :obj:`bool` True if successful + + Raises: + Exception: If any issue was encountered deleting item(s) at path + + Example: + >>> BasePath('some/path').remove() + True + """ + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) + try: + if self.is_file: + os.remove(self.data) + else: + shutil.rmtree(self.data) + except: + raise Exception('Error removing item at ' + self.data) + return True + + def move(self, destination, *args, **kwargs): + """Moves item(s) from current path to another location. + + Effectively moves anything at the given path to the specified location. + Calls .copy() with destination, then .remove() the current path, before + finally the results of .copy(). + + Args: + destination (:obj:`str`) System path to which you want to move item(s) at current path + options (:obj:`dict`, optional) Options settings found at respective keywords + + Returns: + :obj:`self.__class__` New instance of object tied to destination path + + Example: + >>> BasePath('some/path').move('some/other-path') + 'some/other-path' + """ # noqa + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) + new_path_obj = self.copy(destination, options=options) + self.remove() + return new_path_obj + + def load(self, *args, **kwargs): + """Loading method called by child classes. + + Called by child class load methods, stops from loading non-extant file. 
+ + Args: + options :obj:`dict`, optional Options settings found at respective keywords + + Raises: + Exception: If nothing exists at path + """ # noqa + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) + # print loading message if silent option not flagged + if not options['silent']: + print('Loading', self.data) + if not self.exists: + raise Exception('Cannot open item, nothing exists at' + self.data) + + def save(self, *args, **kwargs): + """Saving method called by child classes. + + Called by child class save methods, prevents overwrite without option. + + Args: + options :obj:`dict`, optional Options settings found at respective keywords + + Raises: + Exception: If something exists at path and overwrite option is not set + """ # noqa + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) + # print saving message if silent option not flagged + if not options['silent']: + print('Saving to', self.data) + if self.exists and options['overwrite'] is not True: + raise Exception( + 'Item exists at ' + self.data + ' and overwrite not specified' + ) + # create all parent directories required for save + self.makedirs() + return self + + def makedirs(self, *args, **kwargs): + """Create any missing parent directories of current path. + + Automatically creates any parent directories of the current path + that do not already exist. This function is used by the .save() + method before saving to a location to avoid errors. 
+
+        Example:
+            >>> BasePath(some/path).makedirs()
+            some/path
+        """
+        # get default options and update with any passed options
+        options = self.options
+        if 'options' in kwargs:
+            if type(kwargs['options']) == dict:
+                options.update(kwargs['options'])
+        # if parent directory is non-extant
+        if not os.path.exists(os.path.dirname(self.data)):
+            # attempt to make parent directories
+            try:
+                os.makedirs(os.path.dirname(self.data))
+            # raise an error if somehow directories were created after check
+            except OSError as exc:
+                if exc.errno != errno.EEXIST:
+                    raise
+        return self
+
+
+class BaseFile(BasePath):
+    """Parent class for TextFile and other file related classes.
+    """ # noqa
+    pass
+
+
+class BaseFolder(BasePath):
+    """Parent class for TextFolder and other folder related classes.
+
+    Base parent class to all folder utility objects, not meant to be used on
+    its own. Child classes inherit these functions to work with specific
+    file types.
+
+    Args:
+        path (:obj:`str`) System path pointing to desired folder location
+
+    Attributes:
+        file_class (:obj:``) Class to use when constructing files
+        contents (:obj:`list` of :obj:`str`) List of file and folder names
+        length (:obj:`int`) Total number of items at top level inside folder
+        filenames (:obj:`list` of :obj:`str`) List of only file names
+        folders (:obj:`list` of :obj:`str`) List of only subfolder names
+
+    Methods:
+        files (:obj:`list` of :obj:`self.file_class`) List of BaseFile objects
+
+
+    Examples:
+        >>> # load folder object
+        >>> folder = BaseFolder('some/path')
+        '/absolute/path/to/some/path'
+
+        >>> # load files in folder as list of BaseFile objects
+        >>> BaseFolder('some/path').load()
+        [, , ]
+
+        >>> # use the following context to fast-load/edit/save all files
+        >>> with BaseFolder('some/path') as txt_files:
+        ...     for txt_file in txt_files:
+        ...         with txt_file as txt_data:
+        ...
txt_file.save_data = txt_data.replace('\\n', '') + """ # noqa + file_class = BaseFile + + def __enter__(self): + return self.files() + + def __exit__(self, ctx_type, ctx_value, ctx_traceback): + pass + + @property + def contents(self): + """Lists contents of folder. + + Returns: + :obj:`list` of :obj:`str` File/folder names. + + Example: + >>> Folder('some/path').contents + ['file_1.txt', 'file_2.txt', 'file_3.txt', 'subfolder_1', 'subfolder_2', 'subfolder_3'] + """ # noqa + if not self.exists or not self.is_dir: + return None + return os.listdir(self.data) + + @property + def length(self): + """Convenience method to get the len() of the folder contents. + + Returns: + :obj:`int` Number of items in the folder + + Example: + Folder('some/path').length + 3 + """ + return len(self.contents) + + @property + def filenames(self): + """Returns .contents with non-files filtered. + + Grabs names of directory contents before joining them with the current + path to return list of absolute paths to all files in the directory. + + Returns: + :obj:`list` of :obj:`str` File names + + Example: + >>> Folder(some/path).filenames + ['/absolute/path/to/some/path/file_1.txt', '/absolute/path/to/some/path/file_2.txt', /absolute/path/to/some/path/file_3.txt] + """ # noqa + dir_files = [] + for folder_item in self.contents: + if os.path.isfile(os.path.join(self.data, folder_item)): + dir_files.append(os.path.join(self.data, folder_item)) + return dir_files + + @property + def folders(self): + """Returns .contents with non-folders filtered. + + Grabs names of directory contents before joining them with the current + path to return list of absolute paths to all folders in the directory. 
+ + Returns: + :obj:`list` of :obj:`str` Folder names + + Example: + >>> Folder(some/path).folders + ['subfolder_1', 'subfolder_2', 'subfolder_3'] + """ + dir_subdirs = [] + for folder_item in self.contents: + if os.path.isdir(os.path.join(self.data, folder_item)): + dir_subdirs.append(os.path.join(self.data, folder_item)) + return dir_subdirs + + def files(self, *args, **kwargs): + """ Load all .txt files as BaseFile objects. + + All current files inside the folder at the current path will + be returned as a deque(list) of TextFile objects. You can set which + file extensions will be loaded with the 'extensions' option by passing + a list of string extensions (without the '.'). + + Args: + options (:obj:`dict`, optional) Options settings found at respective keywords + + Returns: + :obj:`collections.deque` of `:obj:`dhelp.TextFile` TextFiles of each .txt file (or other filetype) + + Raises: + Exception: If path does not point to folder + TypeError: If non-list is sent as extensions option + + Examples: + >>> folder_files = BaseFolder('some/path').files() + >>> for folder_file in folder_files: + ... print(folder_file.load()) + Lorem ipsum dolor sit amet... + """ # noqa + contents = deque([]) + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) + if type(options['extensions']) is not list: + raise TypeError('Option "extensions" must be list') + if not self.is_dir: + raise Exception('Item is not a folder:', self.data) + for folder_item in self.contents: + # split the name by . 
and grab the last element for extension
+            item_ext = folder_item.split('.')[-1]
+            # only proceed if item extension is in approved list
+            if item_ext in options['extensions']:
+                # add new TextFile linked to the folder_item's location
+                contents.append(
+                    self.file_class(
+                        os.path.join(self.data, folder_item),
+                        options=options
+                    )
+                )
+        # return as a deque instead of a list
+        return deque(contents)
diff --git a/dhelp/files/csv_file.py b/dhelp/files/csv.py
similarity index 78%
rename from dhelp/files/csv_file.py
rename to dhelp/files/csv.py
index 83fa9a0..d319029 100644
--- a/dhelp/files/csv_file.py
+++ b/dhelp/files/csv.py
@@ -5,8 +5,8 @@ import csv
 from collections import deque
 
-from .path import Path
-from .text_file import TextFile
+from ._bases import BaseFile
+from .txt import TextFile
 
 # prefatory code sets csv field size to the maximum of system limit
 # https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
@@ -24,7 +24,7 @@ decrement = True
 
 
-class CSVFile(Path):
+class CSVFile(BaseFile):
     """Load and save CSV data with lists of dictionaries.
 
     Makes loading and saving CSV data a simple matter. Simplifies the use
@@ -35,11 +35,37 @@ class CSVFile(Path):
         path (:obj:`str`) System path pointing to desired text file location
 
     Examples:
+        >>> # load a csv object, which behaves like a string
         >>> from dhelp import CSVFile
+        >>> CSVFile('some/path.csv')
+        '/absolute/path/to/some/path.csv'
+
+        >>> # manually loop through and edit data, and save...
+        >>> csv_file = CSVFile('some/path.csv')
+        >>> csv_fieldnames = csv_file.fieldnames
+        >>> csv_data = csv_file.load()
+        >>> for data_row in csv_data:
+        >>>     data_row['text'] = data_row['text'].replace('\\n', '')
+        >>> csv_file.save(csv_data, csv_fieldnames, options={'overwrite': True})
+
+        >>> # or use with/as syntax to quickly load data rows, edit, and resave
         >>> csv_file = CSVFile('some/path.csv')
-        >>> print(csv_file)
-        some/path.csv
-    """
+        >>> with csv_file as data_rows:
+        ...
for data_row in data_rows: + ... data_row['text'] = data_row['text'].replace('\\n', '') + ... csv_file.save_data = data_rows + """ # noqa + options = {} + + def __exit__(self, ctx_type, ctx_value, ctx_traceback): + # gets defaults then overrides with any specified options + options = self.options + options.update({'overwrite': True}) + fieldnames = self.fieldnames + if self.save_data: + return self.save( + self.save_data, fieldnames, options=options + ) @property def fieldnames(self): @@ -64,7 +90,7 @@ def fieldnames(self): column_headers.append(column_header) return column_headers - def load(self, options={}): + def load(self, *args, **kwargs): """Load csv as list (deque) of dictionaries. Fast way to load CSV data for editing. Returns a deque, a list-like @@ -83,20 +109,14 @@ def load(self, options={}): Examples: >>> csv_file = CSVFile('some/path.csv') - >>> csv_data = CSVFile.load() - >>> print(csv_data) + >>> CSVFile.load() [{'id': '1', 'text': 'Lorem ipsum', 'notes': ''}, {'id': '2', 'text': 'dolor sit', 'notes': ''}, {'id': '3', 'text': 'amet.', 'notes': ''}] """ # noqa - # set option defaults - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'newline' not in options: - options['newline'] = '' - if 'dialect' not in options: - options['dialect'] = 'excel' - if 'delimiter' not in options: - options['delimiter'] = ',' - super(self.__class__, self).load(options) + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) if not self.is_file: raise Exception('Item is not a file') data_rows = deque([]) @@ -115,7 +135,7 @@ def load(self, options={}): data_rows.append(csv_row) return data_rows - def save(self, data, fieldnames, options={}): + def save(self, data, fieldnames, *args, **kwargs): """Save a list of dictionaries to a .csv file. 
Send a list of dictionaries and a list of their fieldnames to save to @@ -149,18 +169,14 @@ def save(self, data, fieldnames, options={}): ... 'notes': '' ... }] >>> # save to csv file - >>> csv_file = CSVFile('some/path.csv').save(fake_data, fieldnames=fake_fieldnames) - >>> print(csv_file) - /absolute/path/to/some/path.csv + >>> CSVFile('some/path.csv').save(fake_data, fieldnames=fake_fieldnames) + '/absolute/path/to/some/path.csv' """ # noqa - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'newline' not in options: - options['newline'] = '' - if 'dialect' not in options: - options['dialect'] = 'excel' - if 'delimiter' not in options: - options['delimiter'] = ',' + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) # calling super to print messages super(self.__class__, self).save(options) with open( @@ -180,7 +196,7 @@ def save(self, data, fieldnames, options={}): csv_writer.writerow(data_row) return self - def modify(self, destination, modify_cb, options={}): + def modify(self, destination, modify_cb, *args, **kwargs): """Edit every row in the CSV by passing a function. 
Copies CSV to destination then performs the modify_cb callback @@ -211,6 +227,11 @@ def modify(self, destination, modify_cb, options={}): >>> print(altered_csv_file) /absolute/path/to/some/other-path.csv """ # noqa + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) # create csv object tied to destination and empty deque for new data new_csv_file = self.__class__(destination) new_data = [] @@ -225,7 +246,8 @@ def modify(self, destination, modify_cb, options={}): ) def column_to_txts( - self, destination='.', text_col='text', filename_col=None, options={} + self, destination='.', text_col='text', filename_col=None, + *args, **kwargs ): """Coverts a column of text data to a folder of .txt. @@ -248,6 +270,11 @@ def column_to_txts( >>> csv_file.column_to_txts('some/other-path', text_col='text', filename_col='id') some/path.csv """ # noqa + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) # ensure output folder is absolute path if not os.path.isabs(destination): destination = os.path.abspath(destination) diff --git a/dhelp/files/folder.py b/dhelp/files/folder.py deleted file mode 100644 index 062319a..0000000 --- a/dhelp/files/folder.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/python - -import os - -from .path import Path - - -class Folder(Path): - """Parent class for TextFolder and other folder related classes. - - Base parent class to all folder utility objects, not meant to be used on - its own. Child classes inherit these functions to work with specific - file types. 
- - Args: - path (:obj:`str`) System path pointing to desired folder location - - Examples: - >>> folder = Folder('some/path') - >>> print(folder) - '/absolute/path/to/some/path' - """ - - @property - def contents(self): - """Lists contents of folder. - - Returns: - :obj:`list` of :obj:`str` File/folder names. - - Example: - >>> print(Folder(some/path).files) - ['file_1.txt', 'file_2.txt', 'file_3.txt', 'subfolder_1', 'subfolder_2', 'subfolder_3'] - """ # noqa - if not self.exists or not self.is_dir: - return None - return os.listdir(self.data) - - @property - def length(self): - """Convenience method to get the len() of the folder contents. - - Returns: - :obj:`int` Number of items in the folder - - Example: - >>> print(Folder('some/path').length) - 3 - """ - return len(self.contents) - - @property - def files(self): - """Returns .contents with non-files filtered. - - Grabs names of directory contents before joining them with the current - path to return list of absolute paths to all files in the directory. - - Returns: - :obj:`list` of :obj:`str` File names - - Example: - >>> print(Folder(some/path).files) - ['/absolute/path/to/some/path/file_1.txt', '/absolute/path/to/some/path/file_2.txt', /absolute/path/to/some/path/file_3.txt] - """ # noqa - dir_files = [] - for folder_item in self.contents: - if os.path.isfile(os.path.join(self.data, folder_item)): - dir_files.append(os.path.join(self.data, folder_item)) - return dir_files - - @property - def folders(self): - """Returns .contents with non-folders filtered. - - Grabs names of directory contents before joining them with the current - path to return list of absolute paths to all folders in the directory. 
- - Returns: - :obj:`list` of :obj:`str` Folder names - - Example: - >>> print(Folder(some/path).folders) - ['subfolder_1', 'subfolder_2', 'subfolder_3'] - """ - dir_subdirs = [] - for folder_item in self.contents: - if os.path.isdir(os.path.join(self.data, folder_item)): - dir_subdirs.append(os.path.join(self.data, folder_item)) - return dir_subdirs diff --git a/dhelp/files/path.py b/dhelp/files/path.py deleted file mode 100644 index 38f08be..0000000 --- a/dhelp/files/path.py +++ /dev/null @@ -1,325 +0,0 @@ -#!/usr/bin/python - -import os -import errno -import shutil -from collections import UserString - - -class Path(UserString): - """ - Used to interact with a system path in various ways. Not generally meant to - be used directly, Path is parent to various Folder and File classes. - - Args: - path (:obj:`str`) System path pointing to desired text file location - - Raises: - Exception: If a non-string arg is sent as path - """ - - def __init__(self, path=None): - # call parent class constructor and set to a string - super().__init__(str) - # if no filepath specified, default to current working directory - if not path: - path = os.getcwd() - # raise error if path sent but is non-string - if type(path) is not str: - raise Exception('path is not a string') - # if relative path sent, convert to absolute path - if not os.path.isabs(path): - path = os.path.abspath(os.path.join(os.getcwd(), path)) - self.data = path - - @property - def exists(self): - """Check if anything exists at the current path. - - Returns: - :obj:`bool` True if anything exists at path, False if not - - Example: - >>> print(Path('some/extant/path').exists()) - True - >>> print(Path('some/non-extant/path').exists()) - False - - """ - return os.path.exists(self.data) - - @property - def size(self): - """Get file/folder size of anything at the current path. - - Returns the size of any item at the specified path in bytes, returns - 0 if non-extant. 
- - Returns: - :obj:`int` Size of item at path, in bytes - - Example: - >>> print(Path(some/path.txt)) - 121 - """ - # return zero if nothing present - if not self.exists: - return 0 - return os.path.getsize(self.data) - - @property - def basename(self): - """Get file/folder name of current path. - - Returns the basename (last element of path) of the current path - e.g. the name of the current file or folder. - - Returns: - :obj:`str` Name of current file or folder - - Example: - >>> print(Path(some/path.txt)) - path.txt - """ - return os.path.basename(self.data) - - @property - def dirname(self): - """Get parent directory path. - - Returns the absolute path of the parent directory of the current path. - - Returns: :obj:`str` Name of parent directory of current path - - Example: - >>> print(Path(some/path.txt).dirname()) - /absolute/path/to/some - - """ - return os.path.dirname(self.data) - - @property - def is_dir(self): - """Check if path is a directory. - - Returns true if path points to existing directory. - - Returns: :obj:`bool` True if path points to directory, False if not - - Examples: - >>> print(Path(some/path).is_dir()) - True - >>> print(Path(some/path.txt).is_dir()) - False - """ - return os.path.isdir(self.data) - - @property - def is_file(self): - """Check if path is a file. - - Returns true if path points to existing file. - - Returns: - :obj:`bool` True if path points to file, False if not - - Examples: - >>> print(Path(some/path.txt).is_file()) - True - >>> print(Path(some/path).is_file()) - False - """ - return os.path.isfile(self.data) - - @property - def is_link(self): - """Check if path is a link. - - Returns true if path points to symbolic link. 
- - Returns: - :obj:`bool` True if path points to symbolic link, False if not - - Examples: - >>> print(Path(some/link.txt).is_link()) - True - >>> print(Path(nota/link).is_link()) - False - """ - if not self.exists: - return False - return os.path.islink(self.data) - - def copy(self, destination, options={}): - """Copy data at path to another location. - - Copies the contents at system path (if a folder, copies it's contents - recursively) to a specified destination. Returns a new version of the - object linked to the new location. Will raise an error if anything - exists at the destination unless overwrite option is flagged. - - Args: - destination (:obj:`str`) System path to which you want to copy item(s) at current path - options (:obj:`dict`, optional) Options settings found at respective keywords - - Returns: - :obj:`self.__class__` New instance of object tied to the copied path - - Raises: - Exception: If a problem is encountered when copying - - Example: - >>> print(Path('some/path').copy('some/other-path')) - some/other-path - """ # noqa - # set default options - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'overwrite' not in options: - options['overwrite'] = False - # ensure is an absolute path - if not os.path.isabs(destination): - destination = os.path.abspath(destination) - # if destination already exists and overwrite option not set, abort - if os.path.exists(destination) and not options['overwrite']: - raise Exception('Cannot copy, item exists at ' + str(destination)) - # attempt to copy location recursively - try: - if self.is_file: - shutil.copy(self.data, destination) - else: - shutil.copytree(self.data, destination) - # raise exception msg if error encountered - except: - raise Exception( - 'Error copying. Source:', - self.data, - 'Destination', - destination - ) - # return new version of object that is linked to copied location - return self.__class__(destination) - - def remove(self): - """Delete item(s) at current path. 
- - Deletes any item at the current path. If a folder deletes contents - recursively. Returns True if successful. - - Returns: - :obj:`bool` True if successful - - Raises: - Exception: If any issue was encountered deleting item(s) at path - - Example: - >>> print(Path(some/path).remove()) - True - """ - try: - if self.is_file: - os.remove(self.data) - else: - shutil.rmtree(self.data) - except: - raise Exception('Error removing item at ' + self.data) - return True - - def move(self, destination, options={}): - """Moves item(s) from current path to another location. - - Effectively moves anything at the given path to the specified location. - Calls .copy() with destination, then .remove() the current path, before - finally the results of .copy(). - - Args: - destination (:obj:`str`) System path to which you want to move item(s) at current path - options (:obj:`dict`, optional) Options settings found at respective keywords - - Returns: - :obj:`self.__class__` New instance of object tied to destination path - - Example: - >>> print(Path('some/path').move('some/other-path')) - some/other-path - """ # noqa - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'overwrite' not in options: - options['overwrite'] = False - new_path_obj = self.copy(destination, options=options) - self.remove() - return new_path_obj - - def load(self, options={}): - """Loading method called by child classes. - - Called by child class load methods, stops from loading non-extant file. 
- - Args: - options :obj:`dict`, optional Options settings found at respective keywords - - Raises: - Exception: If nothing exists at path - """ # noqa - # set options defaults - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'silent' not in options: - options['silent'] = False - # print loading message if silent option not flagged - if not options['silent']: - print('Loading', self.data) - if not self.exists: - raise Exception('Cannot open item, nothing exists at' + self.data) - - def save(self, options={}): - """Saving method called by child classes. - - Called by child class save methods, prevents overwrite without option. - - Args: - options :obj:`dict`, optional Options settings found at respective keywords - - Raises: - Exception: If something exists at path and overwrite option is not set - """ # noqa - # set option defaults - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'overwrite' not in options: - options['overwrite'] = False - if 'silent' not in options: - options['silent'] = False - # print saving message if silent option not flagged - if not options['silent']: - print('Saving to', self.data) - if self.exists and not options['overwrite']: - raise Exception( - 'Item exists at ' + self.data + ' and overwrite not specified' - ) - # create all parent directories required for save - self.makedirs() - return self - - def makedirs(self): - """Create any missing parent directories of current path. - - Automatically creates any parent directories of the current path - that do not already exist. This function is used by the .save() - method before saving to a location to avoid errors. 
- - Example: - >>> Path(some/path).makedirs() - some/path - """ - # if parent directory is non-extant - if not os.path.exists(os.path.dirname(self.data)): - # attempt to make parent directories - try: - os.makedirs(os.path.dirname(self.data)) - # raise an error if somehow directories were created after check - except OSError as exc: - if exc.errno != errno.EEXist: - raise - return self diff --git a/dhelp/files/tests/__init__.py b/dhelp/files/tests/__init__.py index e69de29..013e4b7 100644 --- a/dhelp/files/tests/__init__.py +++ b/dhelp/files/tests/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/python diff --git a/dhelp/files/tests/abc_case.py b/dhelp/files/tests/abc_case.py deleted file mode 100644 index f770b6f..0000000 --- a/dhelp/files/tests/abc_case.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/python - -import os - -import unittest - - -class AbstractBaseUnitTest(unittest.TestCase): - test_class = None - fixtures_path = None - - def make_test_obj(self, path=None): - # build path from either fixtures_path or path if present - if path and self.fixtures_path: - path = os.path.join(self.fixtures_path, path) - elif not path and self.fixtures_path: - path = self.fixtures_path - # if no path given in either place, raise exception - elif not path and not self.fixtures_path: - raise Exception('No fixtures_path or path specified for unittest.') - return self.test_class(path) - - -if __name__ == "__main__": - unittest.main() diff --git a/dhelp/files/tests/fixtures/csv/fake_data.csv b/dhelp/files/tests/fixtures/csv/fake_data.csv index dc8b02f..8d15162 100644 --- a/dhelp/files/tests/fixtures/csv/fake_data.csv +++ b/dhelp/files/tests/fixtures/csv/fake_data.csv @@ -1,6 +1,6 @@ "id","text","notes" "1","This is the first record","First footnotes" -"1","This is the second record","Second footnotes" -"1","This is the third record","Third footnotes" -"1","This is the fourth record","Fourth footnotes" -"1","This is the fifth record","Fifth footnotes" +"2","This is the second 
record","Second footnotes" +"3","This is the third record","Third footnotes" +"4","This is the fourth record","Fourth footnotes" +"5","This is the fifth record","Fifth footnotes" diff --git a/dhelp/files/tests/test_bases.py b/dhelp/files/tests/test_bases.py new file mode 100644 index 0000000..2d2b6b9 --- /dev/null +++ b/dhelp/files/tests/test_bases.py @@ -0,0 +1,175 @@ +#!/usr/bin/python + +import unittest + +import os +import shutil + +from .._bases import BaseFile, BaseFolder + + +fixtures_src = os.path.join( + os.path.dirname(__file__), + 'fixtures', + 'txt' +) +fixtures_dest = os.path.join( + os.path.dirname(__file__), + 'fixtures', + '.testing' +) +options = { + 'silent': False +} + + +class TextFixturesLayer: + + @classmethod + def testSetUp(cls): + # remove any extant temp fixture files + if os.path.exists(fixtures_dest): + shutil.rmtree(fixtures_dest) + # ensure requisite parent dirs created, make them if not + if not os.path.exists(os.path.dirname(fixtures_dest)): + os.makedirs(os.path.dirname(fixtures_dest)) + # copy fixture files to temp dir + shutil.copytree(fixtures_src, fixtures_dest) + + @classmethod + def testTearDown(cls): + # destroy any temporary fixture files remaining + modified_fixtures_path = os.path.join( + os.path.dirname(fixtures_dest), + '.testing-modified' + ) + if os.path.exists(fixtures_dest): + shutil.rmtree(fixtures_dest) + if os.path.exists(modified_fixtures_path): + shutil.rmtree(modified_fixtures_path) + + +class TestBaseFile(unittest.TestCase): + layer = TextFixturesLayer + + def test_error_non_string(self): + # should error if not sent a string + return self.assertRaises(Exception, lambda: BaseFile(2)) + + def test_default_path(self): + # should generate a default path if none given + exempla = BaseFile() + return self.assertTrue(os.path.exists(str(exempla))) + + def test_relative_path(self): + # should build an absolute path when given a relative + exempla = BaseFile('file.txt') + comparanda = os.path.join( + os.getcwd(), + 
'file.txt' + ) + return self.assertEqual(exempla, comparanda) + + def test_exists(self): + # should return true since file exists + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + return self.assertTrue(exempla.exists) + + def test_size(self): + # should have a file size greater than 0 + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + return self.assertTrue(exempla.size > 0) + + def test_non_extant_size(self): + # should return 0 when doesn't exist + exempla = BaseFile('file.txt') + return self.assertTrue(exempla.size == 0) + + def test_basename(self): + # should return filename correctly + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + comparanda = 'fake_data_1.txt' + return self.assertEqual(exempla.basename, comparanda) + + def test_dirname(self): + # should return parent folder name correctly + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + comparanda = os.path.dirname(str(exempla)) + return self.assertEqual(exempla.dirname, comparanda) + + def test_is_file(self): + # should return true + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + return self.assertTrue(exempla.is_file) + + def test_is_dir(self): + # should return false + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + return self.assertFalse(exempla.is_dir) + + def test_is_link(self): + # should return false + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + return self.assertFalse(exempla.is_link) + + def test_makedirs(self): + # should automatically make parent dirs of path + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + exempla.makedirs(options={'silent': False}) + return self.assertTrue(os.path.exists(os.path.dirname(str(exempla)))) + + def test_copy(self): + # should return new path object, which should exist + exempla = BaseFile( + os.path.join(fixtures_dest, 'fake_data_1.txt'), + options={'silent': False} + 
).copy(os.path.join(fixtures_dest, 'fake_data_1_copy.txt')) + return self.assertTrue(os.path.exists(str(exempla))) + + def test_remove(self): + # should remove temp testing file + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + exempla.remove(options={'silent': False}) + return self.assertFalse(os.path.exists(str(exempla))) + + def test_move(self): + # should copy temp testing file + exempla = BaseFile(os.path.join(fixtures_dest, 'fake_data_1.txt')) + comparanda = exempla.move( + os.path.join(fixtures_dest, 'fake_data_1_copy.txt'), + {'silent': False} + ) + return self.assertTrue( + os.path.exists(str(comparanda)) + and not + os.path.exists(str(exempla)) + ) + + +class TestBaseFolder(unittest.TestCase): + layer = TextFixturesLayer + + def test_contents(self): + # should have 5 items in the folder + exempla = BaseFolder(fixtures_dest) + return self.assertTrue(len(exempla.contents) == 5) + + def test_filenames(self): + # should be 5 items in the folder + exempla = BaseFolder(fixtures_dest) + return self.assertTrue(len(exempla.filenames) == 5) + + def test_folders(self): + # should be no length since there area no folders + exempla = BaseFolder(fixtures_dest) + return self.assertEqual(len(exempla.folders), 0) + + def test_length(self): + # should have 5 items in the folder + exempla = BaseFolder(fixtures_dest) + return self.assertTrue(exempla.length == 5) + + def test_text_files(self): + # should have 5 items in the folder + exempla = BaseFolder(fixtures_dest) + return self.assertTrue(len(exempla.files()) == 5) diff --git a/dhelp/files/tests/test_csv.py b/dhelp/files/tests/test_csv.py new file mode 100644 index 0000000..f81bcfe --- /dev/null +++ b/dhelp/files/tests/test_csv.py @@ -0,0 +1,175 @@ +#!/usr/bin/python + +import unittest + +import os +import csv +import shutil + +from ..csv import CSVFile + + +fixtures_src = os.path.join( + os.path.dirname(__file__), + 'fixtures', + 'csv' +) +fixtures_dest = os.path.join( + os.path.dirname(__file__), + 
'fixtures', + '.testing' +) +options = { + 'silent': False +} + + +class CSVFileLayer: + + @classmethod + def testSetUp(cls): + # remove any extant temp fixture files + if os.path.exists(fixtures_dest): + shutil.rmtree(fixtures_dest) + # ensure requisite parent dirs created, make them if not + if not os.path.exists(os.path.dirname(fixtures_dest)): + os.makedirs(os.path.dirname(fixtures_dest)) + # copy fixture files to temp dir + shutil.copytree(fixtures_src, fixtures_dest) + + @classmethod + def testTearDown(cls): + # destroy any temporary fixture files remaining + if os.path.exists(fixtures_dest): + shutil.rmtree(fixtures_dest) + + +class TestCSVFile(unittest.TestCase): + layer = CSVFileLayer + + def test_load(self): + # first record should match + exempla = CSVFile( + os.path.join(fixtures_dest, 'fake_data.csv'), + options=options + ) + exempla = exempla.load()[0]['text'] + comparanda = 'This is the first record' + return self.assertEqual(exempla, comparanda) + + def test_save(self): + # should correctly modified the first record + csv_records = [] + # manually open csv file + with open( + os.path.join( + fixtures_dest, + 'fake_data.csv' + ), + 'r+' + ) as csv_file: + csv_reader = csv.DictReader(csv_file) + for csv_record in csv_reader: + csv_records.append(csv_record) + # alter first record, then save to file + csv_records[0]['text'] = 'Altered test record' + exempla = CSVFile( + os.path.join(fixtures_dest, 'fake_data.csv'), + options={'overwrite': True, 'silent': False} + ) + exempla.save( + csv_records, + fieldnames=['id', 'text', 'notes'], + ) + # manually reopen csv file to check for results + csv_records = [] + with open( + os.path.join( + fixtures_dest, + 'fake_data.csv' + ), + 'r+' + ) as csv_file: + csv_reader = csv.DictReader(csv_file) + for csv_record in csv_reader: + csv_records.append(csv_record) + return self.assertEqual(csv_records[0]['text'], 'Altered test record') + + def test_modify(self): + # should have modified first record + + def 
modify_function(csv_record): + csv_record['text'] = 'Altered test record' + return csv_record + + exempla = CSVFile( + os.path.join(fixtures_dest, 'fake_data.csv'), + options={'silent': False, 'overwrite': True} + ) + exempla.modify( + os.path.join(fixtures_dest, 'fake_data_modified.csv'), + modify_function + ) + # manually reopen csv file to check for results + csv_records = [] + with open( + os.path.join( + fixtures_dest, + 'fake_data_modified.csv' + ), + 'r+' + ) as csv_file: + csv_reader = csv.DictReader(csv_file) + for csv_record in csv_reader: + csv_records.append(csv_record) + return self.assertEqual(csv_records[4]['text'], 'Altered test record') + + def test_column_to_txts(self): + # should produce a folder of .txt files + exempla = '' + comparanda = 'This is the first record' + destination = os.path.join( + fixtures_dest, + 'csv', + 'txt' + ) + CSVFile( + os.path.join(fixtures_dest, 'fake_data.csv'), + options=options + ).column_to_txts( + destination=destination, + text_col='text', + filename_col='id', + options={'overwrite': True} + ) + # open file manually to check for match + with open( + os.path.join(fixtures_dest, 'csv', 'txt', '1.txt'), + mode='r+' + ) as readfile: + exempla = readfile.read() + return self.assertEqual(exempla, comparanda) + + def test_context_manager(self): + exempla = CSVFile( + os.path.join(fixtures_dest, 'fake_data.csv'), + options=options + ) + comparanda = 'Testing file' + with exempla as data_rows: + edited_rows = data_rows + for edited_row in edited_rows: + edited_row['text'] = 'Testing file' + exempla.save_data = edited_rows + # load manually to check + with open( + os.path.join( + fixtures_dest, + 'fake_data.csv' + ), + mode='r+' + ) as csv_file: + csv_reader = csv.DictReader(csv_file) + # get value from text column of first row + exempla = next(csv_reader)['text'] + return self.assertEqual(exempla, comparanda) diff --git a/dhelp/files/tests/test_csv_file.py b/dhelp/files/tests/test_csv_file.py deleted file mode 100644 
index 2a83b4f..0000000 --- a/dhelp/files/tests/test_csv_file.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/python - -import os -import csv -import shutil - -from .abc_case import AbstractBaseUnitTest - -from ..csv_file import CSVFile - - -fixtures_path = os.path.join( - os.path.dirname(__file__), - 'fixtures', -) -fixture_dir = 'csv' -temp_dir = '.testing' - - -class CSVFileLayer: - - @classmethod - def testSetUp(cls): - source = os.path.join(fixtures_path, fixture_dir) - destination = os.path.join(fixtures_path, temp_dir) - # remove any extant temp fixture files - if os.path.exists(destination): - shutil.rmtree(destination) - # ensure requisite parent dirs created, make them if not - if not os.path.exists(os.path.dirname(destination)): - os.makedirs(os.path.dirname(destination)) - # copy fixture files to temp dir - shutil.copytree(source, destination) - - @classmethod - def testTearDown(cls): - # destroy any temporary fixture files remaining - destination = os.path.join(fixtures_path, temp_dir) - if os.path.exists(destination): - shutil.rmtree(destination) - - -class TestCSVFile(AbstractBaseUnitTest): - layer = CSVFileLayer - test_class = CSVFile - fixtures_path = os.path.join(fixtures_path, temp_dir) - - def test_load(self): - # first record should match - exempla = self.make_test_obj('fake_data.csv') - exempla = exempla.load(options={'silent': True})[0]['text'] - comparanda = 'This is the first record' - return self.assertEqual(exempla, comparanda) - - def test_save(self): - # should correctly modified the first record - csv_records = [] - # manually open csv file - with open( - os.path.join( - fixtures_path, - temp_dir, - 'fake_data.csv' - ), - 'r+' - ) as csv_file: - csv_reader = csv.DictReader(csv_file) - for csv_record in csv_reader: - csv_records.append(csv_record) - # alter first record, then save to file - csv_records[0]['text'] = 'Altered test record' - exempla = self.make_test_obj('fake_data.csv') - exempla.save( - csv_records, - fieldnames=['id', 
'text', 'notes'], - options={'overwrite': True, 'silent': True} - ) - # manually reopen csv file to check for results - csv_records = [] - with open( - os.path.join( - fixtures_path, - temp_dir, - 'fake_data.csv' - ), - 'r+' - ) as csv_file: - csv_reader = csv.DictReader(csv_file) - for csv_record in csv_reader: - csv_records.append(csv_record) - return self.assertEqual(csv_records[0]['text'], 'Altered test record') - - def test_modify(self): - # should have modified first record - - def modify_function(csv_record): - csv_record['text'] = 'Altered test record' - return csv_record - - exempla = self.make_test_obj('fake_data.csv') - exempla.modify( - os.path.join(fixtures_path, temp_dir, 'fake_data_modified.csv'), - modify_function, - options={'silent': True} - ) - # manually reopen csv file to check for results - csv_records = [] - with open( - os.path.join( - fixtures_path, - temp_dir, - 'fake_data_modified.csv' - ), - 'r+' - ) as csv_file: - csv_reader = csv.DictReader(csv_file) - for csv_record in csv_reader: - csv_records.append(csv_record) - return self.assertEqual(csv_records[4]['text'], 'Altered test record') - - def test_column_to_txts(self): - # should have produced a folder of .txt files - pass diff --git a/dhelp/files/tests/test_folder.py b/dhelp/files/tests/test_folder.py deleted file mode 100644 index 6ee87bf..0000000 --- a/dhelp/files/tests/test_folder.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/python - -import os -import shutil - -from .abc_case import AbstractBaseUnitTest - -from ..folder import Folder - - -fixtures_path = os.path.join( - os.path.dirname(__file__), - 'fixtures', -) -fixture_dir = 'txt' -temp_dir = '.testing' - - -class FolderLayer: - - @classmethod - def testSetUp(cls): - source = os.path.join(fixtures_path, fixture_dir) - destination = os.path.join(fixtures_path, temp_dir) - # remove any extant temp fixture files - if os.path.exists(destination): - shutil.rmtree(destination) - # ensure requisite parent dirs created, make them if 
not - if not os.path.exists(os.path.dirname(destination)): - os.makedirs(os.path.dirname(destination)) - # copy fixture files to temp dir - shutil.copytree(source, destination) - - @classmethod - def testTearDown(cls): - # destroy any temporary fixture files remaining - destination = os.path.join(fixtures_path, temp_dir) - if os.path.exists(destination): - shutil.rmtree(destination) - - -class TestFolder(AbstractBaseUnitTest): - layer = FolderLayer - test_class = Folder - fixtures_path = os.path.join(fixtures_path, temp_dir) - - def test_contents(self): - # should have 5 items in the folder - exempla = self.make_test_obj() - return self.assertTrue(len(exempla.contents) == 5) - - def test_files(self): - # should be 5 items in the folder - exempla = self.make_test_obj() - return self.assertTrue(len(exempla.files) == 5) - - def test_folders(self): - # should be no length since there area no folders - exempla = self.make_test_obj() - return self.assertEqual(len(exempla.folders), 0) - - def test_length(self): - # should have 5 items in the folder - exempla = self.make_test_obj() - return self.assertTrue(exempla.length == 5) diff --git a/dhelp/files/tests/test_text_file.py b/dhelp/files/tests/test_text_file.py deleted file mode 100644 index 11ebeb7..0000000 --- a/dhelp/files/tests/test_text_file.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/python - -import os -import shutil - -from .abc_case import AbstractBaseUnitTest - -from ..text_file import TextFile - - -fixtures_path = os.path.join( - os.path.dirname(__file__), - 'fixtures', -) -fixture_dir = 'txt' -temp_dir = '.testing' - - -class TextFileLayer: - - @classmethod - def testSetUp(cls): - source = os.path.join(fixtures_path, fixture_dir) - destination = os.path.join(fixtures_path, temp_dir) - # remove any extant temp fixture files - if os.path.exists(destination): - shutil.rmtree(destination) - # ensure requisite parent dirs created, make them if not - if not os.path.exists(os.path.dirname(destination)): - 
os.makedirs(os.path.dirname(destination)) - # copy fixture files to temp dir - shutil.copytree(source, destination) - - @classmethod - def testTearDown(cls): - # destroy any temporary fixture files remaining - destination = os.path.join(fixtures_path, temp_dir) - if os.path.exists(destination): - shutil.rmtree(destination) - - -class TestTextFile(AbstractBaseUnitTest): - layer = TextFileLayer - test_class = TextFile - fixtures_path = os.path.join(fixtures_path, temp_dir) - - def test_exists(self): - # should return true since file exists - exempla = self.make_test_obj('fake_data_1.txt') - return self.assertTrue(exempla.exists) - - def test_size(self): - # should have a file size greater than 0 - exempla = self.make_test_obj('fake_data_1.txt') - return self.assertTrue(exempla.size > 0) - - def test_basename(self): - # should return filename correctly - exempla = self.make_test_obj('fake_data_1.txt') - comparanda = 'fake_data_1.txt' - return self.assertEqual(exempla.basename, comparanda) - - def test_dirname(self): - # should return parent folder name correctly - exempla = self.make_test_obj('fake_data_1.txt') - comparanda = os.path.dirname(str(exempla)) - return self.assertEqual(exempla.dirname, comparanda) - - def test_is_file(self): - # should return true - exempla = self.make_test_obj('fake_data_1.txt') - return self.assertTrue(exempla.is_file) - - def test_is_dir(self): - # should return false - exempla = self.make_test_obj('fake_data_1.txt') - return self.assertFalse(exempla.is_dir) - - def test_is_link(self): - # should return false - exempla = self.make_test_obj('fake_data_1.txt') - return self.assertFalse(exempla.is_link) - - def test_makedirs(self): - # should automatically make parent dirs of path - exempla = self.make_test_obj(os.path.join('subdir', 'fake_data_1.txt')) - exempla.makedirs() - return self.assertTrue(os.path.exists(os.path.dirname(str(exempla)))) - - def test_copy(self): - # should return new path object - exempla = 
self.make_test_obj('fake_data_1.txt') - exempla = exempla.copy( - os.path.join(fixtures_path, temp_dir, 'fake_data_1_copy.txt') - ) - return self.assertTrue(os.path.exists(str(exempla))) - - def test_remove(self): - # should remove temp testing file - exempla = self.make_test_obj('fake_data_1.txt') - exempla.remove() - return self.assertFalse(os.path.exists(str(exempla))) - - def test_move(self): - # should copy temp testing file - exempla = self.make_test_obj('fake_data_1.txt') - comparanda = exempla.move( - os.path.join(fixtures_path, temp_dir, 'fake_data_1_copy.txt') - ) - return self.assertTrue( - os.path.exists(str(comparanda)) and - not os.path.exists(str(exempla)) - ) - - def test_load(self): - # first line of loaded content should match - exempla = self.make_test_obj('fake_data_1.txt') - exempla = exempla.load(options={'silent': True}).split('\n')[0] - comparanda = 'First test file' - return self.assertEqual(exempla, comparanda) - - def test_save_no_overwrite(self): - # should raise exception if overwrite is not specified - exemplum = self.make_test_obj('fake_data_1.txt') - return self.assertRaises(Exception, lambda: exemplum.save( - 'Altered test file', - options={'silent': True} - )) - - def test_save_overwrite(self): - # should save altered testing file - exempla = None - comparanda = 'Altered test file' - self.make_test_obj('fake_data_1.txt').save( - comparanda, options={'overwrite': True, 'silent': True} - ) - with open( - os.path.join(fixtures_path, temp_dir, 'fake_data_1.txt') - ) as test_file: - exempla = test_file.read() - return self.assertEqual(exempla, comparanda) diff --git a/dhelp/files/tests/test_text_folder.py b/dhelp/files/tests/test_text_folder.py deleted file mode 100644 index f52b458..0000000 --- a/dhelp/files/tests/test_text_folder.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python - -import os -import shutil - -from .abc_case import AbstractBaseUnitTest - -from ..text_folder import TextFolder - - -fixtures_path = os.path.join( - 
os.path.dirname(__file__), - 'fixtures', -) -fixture_dir = 'txt' -temp_dir = '.testing' - - -class TextFolderLayer: - - @classmethod - def testSetUp(cls): - source = os.path.join(fixtures_path, fixture_dir) - destination = os.path.join(fixtures_path, temp_dir) - # remove any extant temp fixture files - if os.path.exists(destination): - shutil.rmtree(destination) - # ensure requisite parent dirs created, make them if not - if not os.path.exists(os.path.dirname(destination)): - os.makedirs(os.path.dirname(destination)) - # copy fixture files to temp dir - shutil.copytree(source, destination) - - @classmethod - def testTearDown(cls): - # destroy any temporary fixture files remaining - destination = os.path.join(fixtures_path, temp_dir) - if os.path.exists(destination): - shutil.rmtree(destination) - - -class TestTextFolder(AbstractBaseUnitTest): - layer = TextFolderLayer - test_class = TextFolder - fixtures_path = os.path.join(fixtures_path, temp_dir) - - def test_text_files(self): - # should have 5 items in the folder - exempla = self.make_test_obj() - return self.assertTrue(len(exempla.text_files()) == 5) - - def test_modify(self): - # should see changes after modifying files with a cb function - - def modify_file_function(record): - return comparanda - - exempla = '' - comparanda = 'Altered test file' - destination = os.path.join(fixtures_path, temp_dir, 'test') - # perform modification - self.make_test_obj().modify( - destination, - modify_file_function, - options={'silent': True} - ) - # open file to check for success - with open( - os.path.join(fixtures_path, temp_dir, 'test', 'fake_data_1.txt') - ) as test_file: - exempla = test_file.read() - return self.assertEqual(exempla, comparanda) diff --git a/dhelp/files/tests/test_txt.py b/dhelp/files/tests/test_txt.py new file mode 100644 index 0000000..95ceac3 --- /dev/null +++ b/dhelp/files/tests/test_txt.py @@ -0,0 +1,154 @@ +#!/usr/bin/python + +import unittest + +import os +import shutil + +from ..txt import 
TextFile, TextFolder + + +fixtures_src = os.path.join( + os.path.dirname(__file__), + 'fixtures', + 'txt' +) +fixtures_dest = os.path.join( + os.path.dirname(__file__), + 'fixtures', + '.testing' +) +options = { + 'silent': False +} + + +class TextFixturesLayer: + + @classmethod + def testSetUp(cls): + # remove any extant temp fixture files + if os.path.exists(fixtures_dest): + shutil.rmtree(fixtures_dest) + # ensure requisite parent dirs created, make them if not + if not os.path.exists(os.path.dirname(fixtures_dest)): + os.makedirs(os.path.dirname(fixtures_dest)) + # copy fixture files to temp dir + shutil.copytree(fixtures_src, fixtures_dest) + + @classmethod + def testTearDown(cls): + # destroy any temporary fixture files remaining + modified_fixtures_path = os.path.join( + os.path.dirname(fixtures_dest), + '.testing-modified' + ) + if os.path.exists(fixtures_dest): + shutil.rmtree(fixtures_dest) + if os.path.exists(modified_fixtures_path): + shutil.rmtree(modified_fixtures_path) + + +class TestTextFile(unittest.TestCase): + layer = TextFixturesLayer + + def test_load(self): + # first line of loaded content should match + exempla = TextFile( + os.path.join(fixtures_dest, 'fake_data_1.txt'), + options={'silent': False} + ) + exempla = exempla.load().split('\n')[0] + comparanda = 'First test file' + return self.assertEqual(exempla, comparanda) + + def test_save_no_overwrite(self): + # should raise exception if overwrite is not specified + exempla = TextFile( + os.path.join(fixtures_dest, 'fake_data_1.txt'), + options={'silent': False, 'overwrite': False} + ) + return self.assertRaises(Exception, lambda: exempla.save( + 'Altered test file', + )) + + def test_save_overwrite(self): + # should save altered testing file + exempla = TextFile( + os.path.join(fixtures_dest, 'fake_data_1.txt'), + options={'silent': False} + ) + comparanda = 'Altered test file' + exempla.save( + comparanda, options={'overwrite': True, 'silent': False} + ) + with open( + 
os.path.join(fixtures_dest, 'fake_data_1.txt') + ) as test_file: + exempla = test_file.read() + return self.assertEqual(exempla, comparanda) + + def test_context_manager(self): + exempla = TextFile( + os.path.join(fixtures_dest, 'fake_data_1.txt'), + options={'silent': False} + ) + comparanda = 'Testing message' + with exempla as file_data: + exempla.save_data = 'Testing message' + with open( + os.path.join(fixtures_dest, 'fake_data_1.txt'), + 'r+', + encoding='utf-8' + ) as file_data: + exempla = file_data.read() + return self.assertEqual(exempla, comparanda) + + +class TestTextFolder(unittest.TestCase): + layer = TextFixturesLayer + + def test_modify(self): + # should see changes after modifying files with a cb function + + def modify_file_function(record): + return comparanda + + exempla = '' + comparanda = 'Altered test file' + # perform modification + TextFolder(fixtures_dest, options={'silent': False}).modify( + os.path.join( + os.path.dirname(fixtures_dest), + '.testing-modified' + ), + modify_file_function, + options={'silent': False, 'overwrite': True} + ) + # open file to check for success + with open( + os.path.join( + os.path.dirname(fixtures_dest), + '.testing-modified', + 'fake_data_1.txt' + ), + ) as test_file: + exempla = test_file.read() + return self.assertEqual(exempla, comparanda) + + def test_context_manager(self): + exempla = None + comparanda = 'Testing message' + with TextFolder(fixtures_dest, options={'silent': False}) as txt_files: + for txt_file in txt_files: + with txt_file as txt_data: + txt_file.save_data = txt_data + txt_file.save_data = 'Testing message' + with open( + os.path.join( + fixtures_dest, + 'fake_data_1.txt' + ), + ) as file_data: + exempla = file_data.read() + return self.assertEqual(exempla, comparanda) diff --git a/dhelp/files/text_file.py b/dhelp/files/text_file.py deleted file mode 100644 index 1a1f6a0..0000000 --- a/dhelp/files/text_file.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/python - -from .path import Path 
- - -class TextFile(Path): - """Load and save data quickly to path specified. - - Represents the plain text file at the path specified. Loads data - located at given path as a string. Likewise if .save() will save string - data at the system path send to TextFile. - - This object can be used by itself, or can be constructed automatically by - using TextFolder. - - Args: - path (:obj:`str`) System path pointing to desired text file location - - Attributes: - exists (:obj:`bool`) Whether or not a file exists at the location - size (:obj:`int`) Size of item(s) stored at current location - basename (:obj:`str`) Name of current file - dirname (:obj:`str`) Full path to file's parent directory - - Examples: - >>> from dhelp import TextFile - >>> text_file = TextFile('some/path.txt') - >>> print(text_file) - some/path.txt - """ - - def load(self, options={}): - """Opens the file data as a single string. - - Opens the file using 'utf-8' unless otherwise specified in options. - Returns data as a string unless 'readlines' option is specified, in - which case data is returned as a list of strings. - - Args: - options (:obj:`dict`, optional) Options settings found at respective keywords - - Raises: - Exception: If path does not point to a file - - Examples: - >>> file_data = TextFile('some/path.txt').load() - >>> print(file_data) - Lorem ipsum dolor sit amet... 
- """ # noqa - # set option defaults - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'readlines' not in options: - options['readlines'] = False - super(self.__class__, self).load(options) - if not self.is_file: - raise Exception('Item is not a file') - file_data = '' - with open(self.data, 'r+', encoding=options['encoding']) as read_file: - # if option specified, return as list of text lines - if options['readlines']: - file_data = read_file.readlines() - # normally return entire data as single string - else: - file_data = read_file.read() - return file_data - - def save(self, data, options={}): - """Saves string data to file. - - Receives string data and writes it to a file. If a list is received, - it rejoins the list with endlines before saving. If anything exists - at the current path, an exception will be raised unless the 'overwrite' - option it set. - - Args: - data (:obj:`str`) Data to be saved to file, must be a single string - options (:obj:`dict`, optional) Options settings found at respective keywords - - Examples: - >>> # saving to a new location - >>> saved_text_file = TextFile('some/path.txt').save('Lorem ipsum dolor sit amet...') # noqa - >>> print(saved_text_file) # noqa - '/absolute/path/to/some/path.txt' - - >>> # setting overwrite option - >>> options = {'overwrite': True} - >>> saved_text_file = saved_text_file.save('consectetur adipiscing elit', options=options) - >>> print(saved_text_file) - /absolute/path/to/some/path.txt - """ - # set option defaults - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'overwrite' not in options: - options['overwrite'] = False - super(self.__class__, self).save(options) - with open(self.data, 'w+', encoding=options['encoding']) as write_file: - write_file.write(data) - return True diff --git a/dhelp/files/text_folder.py b/dhelp/files/text_folder.py deleted file mode 100644 index a2d6cdf..0000000 --- a/dhelp/files/text_folder.py +++ /dev/null @@ -1,119 +0,0 @@ 
-#!/usr/bin/python - -import os -from collections import deque - -from .folder import Folder -from .text_file import TextFile - - -class TextFolder(Folder): - """ Load or save a folder of plaintext files as a list of strings. - - Object for interacting with a folder of plain text files. Allows quick - discovery of filepaths and construction of relevant TextFile objects. Also - enables batch editing of an entire directory by passing a callback. - - Args: - path (:obj:`str`) System path pointing to desired text folder location - - Examples: - >>> from dhelp import TextFolder - >>> text_folder = TextFolder('some/path') - >>> print(text_folder) - some/path - """ - - def text_files(self, options={}): - """ Load all .txt files as TextFile objects. - - All current .txt files inside the folder at the current path will - be returned as a deque(list) of TextFile objects. You can set which - file extensions will be loaded with the 'extensions' option by passing - a list of string extensions (without the '.'). - - Args: - options (:obj:`dict`, optional) Options settings found at respective keywords - - Returns: - :obj:`collections.deque` of `:obj:`dhelp.TextFile` TextFiles of each .txt file (or other filetype) - - Raises: - Exception: If path does not point to folder - TypeError: If non-list is sent as extensions option - - Examples: - >>> folder_files = TextFolder('some/path').text_files() - >>> for folder_file in folder_files: - ... print(folder_file.load()) - Lorem ipsum dolor sit amet... - """ # noqa - contents = deque([]) - # set option defaults - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'extensions' not in options: - options['extensions'] = ['txt'] - if type(options['extensions']) is not list: - raise TypeError('Option "extensions" must be list') - if not self.is_dir: - raise Exception('Item is not a folder') - for folder_item in self.contents: - # split the name by . 
and grab the last element for extension - item_ext = folder_item.split('.')[-1] - # only proceed if item extension is in approved list - if item_ext in options['extensions']: - # add new TextFile linked to the folder_item's location - contents.append(TextFile(os.path.join(self.data, folder_item))) - # return as a deque instead of a list - return deque(contents) - - def modify(self, destination, modify_cb, options={}): - """ Edit and save every file in the folder by passing a function. - - Opens every file and performs a callback function sent to it. Provides - a fast means of batch editing an entire folder of txt files. Returns - a new TextFolder linked with the modified copy. - - The callback function should have only one argument (e.g. record_data) - which represents the data of any given file, in string format (see - example below). Whatever the function returns is what will be - saved to the modified file, as long as it is a string. - - Args: - destination (:obj:`string`) System path where you want the altered folder to be saved - modifycb (:obj:`function`) User-defined function used to modify each record's data - options (:obj:`dict`, optional) Options settings found at respective keywords - - Returns: - :obj:`self.__class__` New TextFolder object tied to the modified folder - - Examples: - >>> # define a function which alters data as you wish - >>> def modify_record(record_data): - >>> record_data = record_data.replace('\\n', '') - >>> return record_data - - >>> # if you don't specify destination, a backup will be made - >>> options = {'destination': 'some/other-path'} - - >>> # use TextFolder().modify, pass your function as 1st arg - >>> text_folder = TextFolder('some/path').modify(modify_record, options=options) - >>> print(text_folder) - /absolute/path/to/some/path - """ # noqa - # set option defaults - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'extensions' not in options: - options['extensions'] = ['txt'] - if 'overwrite' not in 
options: - options['overwrite'] = True - if 'silent' not in options: - options['silent'] = True - modified_folder = self.copy(destination, options=options) - for item_file in modified_folder.text_files(): - item_data = modify_cb(item_file.load(options=options)) - item_file.save(item_data, options=options) - # return self upon success - return modified_folder diff --git a/dhelp/files/txt.py b/dhelp/files/txt.py new file mode 100644 index 0000000..d7cae74 --- /dev/null +++ b/dhelp/files/txt.py @@ -0,0 +1,168 @@ +#!/usr/bin/python + +from ._bases import BaseFile, BaseFolder + + +class TextFile(BaseFile): + """Load and save data quickly to path specified. + + Represents the plain text file at the path specified. Loads data + located at given path as a string. Likewise if .save() will save string + data at the system path send to TextFile. + + This object can be used by itself, or can be constructed automatically by + using TextFolder. + + Args: + path (:obj:`str`) System path pointing to desired text file location + + Attributes: + exists (:obj:`bool`) Whether or not a file exists at the location + size (:obj:`int`) Size of item(s) stored at current location + basename (:obj:`str`) Name of current file + dirname (:obj:`str`) Full path to file's parent directory + + Examples: + >>> from dhelp import TextFile + >>> TextFile('some/path.txt') + '/absolute/path/to/some/path.txt' + """ # noqa + + def load(self, *args, **kwargs): + """Opens the file data as a single string. + + Opens the file using 'utf-8' unless otherwise specified in options. + Returns data as a string unless 'readlines' option is specified, in + which case data is returned as a list of strings. + + Args: + options (:obj:`dict`, optional) Options settings found at respective keywords + + Raises: + Exception: If path does not point to a file + + Examples: + >>> TextFile('some/path.txt').load() + 'Lorem ipsum dolor sit amet...' 
+ """ # noqa + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) + # print loading message if silent option not flagged + if not options['silent']: + print('Loading', self.data) + if not self.is_file: + raise Exception('Item is not a file') + file_data = '' + with open(self.data, 'r+', encoding=options['encoding']) as read_file: + # if option specified, return as list of text lines + if options['readlines']: + file_data = read_file.readlines() + # normally return entire data as single string + else: + file_data = read_file.read() + return file_data + + def save(self, data, *args, **kwargs): + """Saves string data to file. + + Receives string data and writes it to a file. If a list is received, + it rejoins the list with endlines before saving. If anything exists + at the current path, an exception will be raised unless the 'overwrite' + option it set. + + Args: + data (:obj:`str`) Data to be saved to file, must be a single string + options (:obj:`dict`, optional) Options settings found at respective keywords + + Examples: + >>> # saving to a new location + >>> TextFile('some/path.txt').save('Lorem ipsum dolor sit amet...') + '/absolute/path/to/some/path.txt' + + >>> # setting overwrite option + >>> options = {'overwrite': True} + >>> TextFile('some/path.txt').save('consectetur adipiscing elit', options=options) + '/absolute/path/to/some/path.txt' + """ # noqa + # get default options and update with any passed options + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) + # print saving message if silent option not flagged + if not options['silent']: + print('Saving to', self.data) + if self.exists and options['overwrite'] is not True: + raise Exception( + 'Item exists at ' + self.data + ' and overwrite not specified' + ) + # create all parent directories required 
for save + self.makedirs() + with open(self.data, 'w+', encoding=options['encoding']) as write_file: + write_file.write(data) + return True + + +class TextFolder(BaseFolder): + """ Load or save a folder of plaintext files as a list of strings. + + Object for interacting with a folder of plain text files. Allows quick + discovery of filepaths and construction of relevant TextFile objects. Also + enables batch editing of an entire directory by passing a callback. + + Args: + path (:obj:`str`) System path pointing to desired text folder location + + Examples: + >>> from dhelp import TextFolder + >>> TextFolder('some/path') + '/absolute/path/to/some/path' + """ + file_class = TextFile + + def modify(self, destination, modify_cb, *args, **kwargs): + """ Edit and save every file in the folder by passing a function. + + Opens every file and performs a callback function sent to it. Provides + a fast means of batch editing an entire folder of txt files. Returns + a new TextFolder linked with the modified copy. + + The callback function should have only one argument (e.g. record_data) + which represents the data of any given file, in string format (see + example below). Whatever the function returns is what will be + saved to the modified file, as long as it is a string. 
+ + Args: + destination (:obj:`string`) System path where you want the altered folder to be saved + modifycb (:obj:`function`) User-defined function used to modify each record's data + options (:obj:`dict`, optional) Options settings found at respective keywords + + Returns: + :obj:`self.__class__` New TextFolder object tied to the modified folder + + Examples: + >>> # define a function which alters data as you wish + >>> def modify_record(record_data): + >>> record_data = record_data.replace('\\n', '') + >>> return record_data + + >>> # if you don't specify destination, a backup will be made + >>> options = {'destination': 'some/other-path'} + + >>> # use TextFolder().modify, pass your function as 1st arg + >>> TextFolder('some/path').modify(modify_record, options=options) + '/absolute/path/to/some/path' + """ # noqa + options = self.options + if 'options' in kwargs: + if type(kwargs['options']) == dict: + options.update(kwargs['options']) + modified_folder = self.copy(destination, options=options) + for item_file in modified_folder.files(): + item_data = modify_cb(item_file.load(options=options)) + item_file.save(item_data, options=options) + # return self upon success + return modified_folder diff --git a/dhelp/settings.py b/dhelp/settings.py new file mode 100644 index 0000000..7d88cce --- /dev/null +++ b/dhelp/settings.py @@ -0,0 +1,162 @@ +LANGUAGES = [ + 'english', + 'latin', + 'ancient greek', +] + +DELIMITERS = [ + ',', + ';', + '\t', +] + +NLTK_PACKAGES = { + 'english': [ + ('verbnet', ['corpora', 'verbnet.zip']), + ('wordnet', ['corpora', 'wordnet.zip']), + ('words', ['corpora', 'words.zip']), + ('large_grammars', ['grammars', 'large_grammars.zip']), + ( + 'averaged_perceptron_tagger', + ['taggers', 'averaged_perceptron_tagger.zip'] + ), + ( + 'maxent_treebank_pos_tagger', + ['taggers', 'maxent_treebank_pos_tagger.zip'] + ), + ('universal_tagset', ['taggers', 'universal_tagset.zip']), + ('punkt', ['tokenizers', 'punkt.zip']), + ('maxent_ne_chunker', 
['chunkers', 'maxent_ne_chunker.zip']), + ] +} + +CLTK_PACKAGES = { + 'greek': [ + ('greek_software_tlgu', 'software/greek_software_tlgu'), + ('greek_proper_names_cltk', 'lexicon_greek_proper_names_cltk'), + ('greek_models_cltk', 'models/greek_models_cltk'), + ('greek_treebank_perseus', 'treebank/greek_treebank_perseus'), + ('greek_lexica_perseus', 'lexicon/greek_lexica_perseus'), + ( + 'greek_training_set_sentence_cltk', + 'training_set/greek_training_set_sentence_cltk' + ), + ('greek_word2vec_cltk', 'lexicon/greek_word2vec_cltk'), + ], + 'latin': [ + ('latin_treebank_perseus', 'treebank/latin_treebank_perseus'), + ('latin_proper_names_cltk', 'lexicon/latin_proper_names_cltk'), + ('latin_models_cltk', 'models/latin_models_cltk'), + ('latin_pos_lemmata_cltk', 'lemma/latin_pos_lemmata_cltk'), + ( + 'latin_treebank_index_thomisticus', + 'treebank/latin_treebank_index_thomisticus' + ), + ('latin_lexica_perseus', 'lexicon/latin_lexica_perseus'), + ( + 'latin_training_set_sentence_cltk', + 'training_set/latin_training_set_sentence_cltk' + ), + ('latin_word2vec_cltk', 'models/latin_word2vec_cltk'), + ] +} + +ENCODINGS = [ + ('ascii'), + ('big5'), + ('big5khscs'), + ('cp037'), + ('cp273'), + ('cp424'), + ('cp437'), + ('cp500'), + ('cp720'), + ('cp737'), + ('cp775'), + ('cp850'), + ('cp852'), + ('cp855'), + ('cp856'), + ('cp857'), + ('cp858'), + ('cp860'), + ('cp861'), + ('cp862'), + ('cp863'), + ('cp864'), + ('cp865'), + ('cp866'), + ('cp869'), + ('cp874'), + ('cp875'), + ('cp932'), + ('cp949'), + ('cp950'), + ('cp1006'), + ('cp1026'), + ('cp1125'), + ('cp1140'), + ('cp1250'), + ('cp1251'), + ('cp1252'), + ('cp1254'), + ('cp1255'), + ('cp1256'), + ('cp1257'), + ('cp1258'), + ('cp65001'), + ('euc_jp'), + ('euc_jis_2004'), + ('euc_jisx0213'), + ('euc_kr'), + ('gb2312'), + ('gbk'), + ('gb18030'), + ('hz'), + ('iso2022_jp'), + ('iso2022_jp_1'), + ('iso2022_jp_2'), + ('iso2022_jp_2004'), + ('iso2022_jp_3'), + ('iso2022_jp_exit'), + ('iso2022_kr'), + ('latin_1'), + 
('iso8859_2'), + ('iso8859_3'), + ('iso8859_4'), + ('iso8859_5'), + ('iso8859_6'), + ('iso8859_7'), + ('iso8859_8'), + ('iso8859_9'), + ('iso8859_10'), + ('iso8859_11'), + ('iso8859_13'), + ('iso8859_14'), + ('iso8859_15'), + ('iso8859_16'), + ('johab'), + ('koi8_r'), + ('koi8_t'), + ('koi8_u'), + ('kz1048'), + ('mac_cyrillic'), + ('mac_greek'), + ('mac_iceland'), + ('mac_latin2'), + ('mac_roman'), + ('mac_turkish'), + ('ptcp154'), + ('shift_jis'), + ('shift_jis_2004'), + ('shift_jisx0213'), + ('utf_32'), + ('utf_32_be'), + ('utf_32_le'), + ('utf_16'), + ('utf_16_be'), + ('utf_16_le'), + ('utf_7'), + ('utf_8'), + ('utf_8_sig'), +] diff --git a/dhelp/tests/__init__.py b/dhelp/tests/__init__.py index e69de29..013e4b7 100644 --- a/dhelp/tests/__init__.py +++ b/dhelp/tests/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/python diff --git a/dhelp/tests/test_settings.py b/dhelp/tests/test_settings.py new file mode 100644 index 0000000..da6c2d6 --- /dev/null +++ b/dhelp/tests/test_settings.py @@ -0,0 +1,20 @@ +#!/usr/bin/python + +import unittest + +from .. 
import settings + + +class TestSettings(unittest.TestCase): + + def test_languages(self): + return self.assertTrue(settings.LANGUAGES) + + def test_delimiters(self): + return self.assertTrue(settings.DELIMITERS) + + def test_nltk_packages(self): + return self.assertTrue(settings.NLTK_PACKAGES) + + def test_encodings(self): + return self.assertTrue(settings.ENCODINGS) diff --git a/dhelp/tests/test_web_page.py b/dhelp/tests/test_web_page.py new file mode 100644 index 0000000..3f83e6b --- /dev/null +++ b/dhelp/tests/test_web_page.py @@ -0,0 +1,39 @@ +#!/usr/bin/python + +import unittest + +from bs4 import BeautifulSoup + +from ..web import WebPage + + +class TestWebPage(unittest.TestCase): + + def test_fetch(self): + # ensure request returns text data + page = WebPage('https://stackoverflow.com', options={'silent': True}) + return self.assertTrue(len(page.fetch()) > 0) + + def test_soup(self): + # ensure object is a BeautifulSoup type object + page = WebPage('https://stackoverflow.com', options={'silent': True}) + return self.assertTrue(type(page.soup()) == BeautifulSoup) + + def test_max_retries(self): + # should return none after hitting max_retries getting invalid page + page = WebPage( + 'http://0.0.0.0', + options={ + 'silent': True, + 'max_retries': 3 + } + ) + return self.assertEqual(page.fetch(), None) + + def test_context_manager(self): + # ensure soup works when invoked using with.. as.. 
context manager + results = None + page = WebPage('https://stackoverflow.com', options={'silent': True}) + with page as page_soup: + results = page_soup + return self.assertTrue((type(results)) == BeautifulSoup) diff --git a/dhelp/text/__init__.py b/dhelp/text/__init__.py index 013e4b7..dcd4e1e 100644 --- a/dhelp/text/__init__.py +++ b/dhelp/text/__init__.py @@ -1 +1,4 @@ #!/usr/bin/python + +from .nltk import EnglishText +from .cltk import LatinText, AncientGreekText diff --git a/dhelp/text/_bases.py b/dhelp/text/_bases.py new file mode 100644 index 0000000..117915c --- /dev/null +++ b/dhelp/text/_bases.py @@ -0,0 +1,178 @@ +#!/usr/bin/python + +import re +from collections import UserString + + +class BaseText(UserString): + """Performs text manipulation and natural language processing. + + Base class for all Text objects. Can be used on its own to perform a number + of operations, although it is best used with on of its language-specific + children. + + Args: + text (:obj:`str`) Text to be stored for processing/nlp + options (:obj:`dict`, optional) Options settings found at respective keywords + + Example: + >>> from dhelp import BaseText + >>> text = BaseText('Lorem ipsum dolor sit amet...') + >>> print(text) + 'Lorem ipsum dolor sit amet...' + """ # noqa + + def __init__(self, text, options={}): + super().__init__(str) + if 'encoding' not in options: + options['encoding'] = 'utf-8' + if 'language' not in options: + options['language'] = 'english' + self.data = text + self.options = options + + def __enter__(self): + pass + + def __exit__(self, ctx_type, ctx_value, ctx_traceback): + pass + + def stringify(self): + """Returns the text of this object as a pure string type. + + Can be useful when you need the text back in a string object format + for comparison with regular strings. 
+ + Returns: + :obj:`str` String form of the text + + Example: + >>> text = BaseText('Lorem ipsum dolor sit amet...') + >>> stringified_text = text.stringify() + >>> print(type(stringified_text)) + + """ + return str(self.data) + + def rm_lines(self): + """Removes endlines. + + Gives a new version of the text with all endlines removed. Removes + any dashed line endings and rejoins split words. + + Returns: + :obj:`self.__class__` New version of text, with endlines removed + + Example: + >>> text = BaseText('Lorem\\nipsum do-\\nlor sit amet....\\n') + >>> modified_text = text.rm_lines() + >>> print(modified_text) + 'Lorem ipsum dolor sit amet...' + """ + rexr = re.compile(r'\n+') + # substituting single endlines for matching endline blocks + clean_text = rexr.sub(' ', self.data) + return self.__class__( + clean_text + .replace('-\n ', '').replace('- \n', '').replace('-\n', '') + .replace(' - ', '').replace('- ', '').replace(' -', '') + .replace('\n', ' '), + self.options + ) + + def rm_nonchars(self): + """Removes non-language characters. + + Gives a new version of the text with only latin characters remaining, + or Greek characters for Greek, texts, and so on. Defaults to assuming + Latin based. + + Returns: + :obj:`self.__class__` Returns new version of text, with non-letters removed + + Example: + >>> text = BaseText('1αLorem ipsum 2βdolor sit 3γamet...') + >>> modified_text = text.rm_nonchars() + >>> print(modified_text) + 'Lorem ipsum dolor sit amet...' + """ # noqa + if self.options['language'] == 'greek': + valid_chars_pattern = '([ʹ-Ϋά-ϡἀ-ᾯᾰ-῾ ])' + else: + valid_chars_pattern = '([A-Za-z ])' + return self.__class__( + "".join(re.findall(valid_chars_pattern, self.data)), + self.options + ) + + def rm_edits(self): + """Removes text inside editor's marks. + + Gives a new version with any text between editorial marks such as + brackets or parentheses removed. 
+ + Returns: + :obj:`self.__class__` Returns new version of text, with editoria removed + + Example: + >>> text = BaseText('Lore[m i]psum {dolo}r sit a(met)...') + >>> modified_text = text.rm_edits() + >>> print(modified_text) + 'Lor psum r sit a...' + """ # noqa + return self.__class__( + re.sub("\〚(.*?)\〛", "", re.sub("\{(.*?)\}", "", re.sub( + "\((.*?)\)", "", re.sub("\<(.*?)\>", "", re.sub( + "\[(.*?)\]", "", self.data))))), + self.options + ) + + def rm_spaces(self): + """Removes extra whitespace. + + Gives a new version of the text with extra whitespace collapsed. + + Returns: + :obj:`self.__class__` Returns new version of text, with extra spaced collapsed + + Example: + >>> text = BaseText('Lorem ipsum dolor sit amet...') + >>> modified_text = text.rm_spaces() + >>> print(modified_text) + 'Lorem ipsum dolor sit amet...' + """ # noqa + # regex compiler for all whitespace blocks + rexr = re.compile(r'\s+') + # substituting single spaces for matching whitespace blocks + clean_text = rexr.sub(' ', self.data) + return self.__class__( + clean_text.strip(), + self.options + ) + + def re_search(self, pattern): + """Search text for matching pattern. + + Receives search pattern and returns True/False if it matches. Pattern + can be a simple string match (e.g. .re_search('does this match?')), or + a full Regular Expression. 
+ + Args: + pattern (:obj:`str`) String with the desired Regular Expression to search + + Returns: + :obj:`bool` True if matching, False if not + + Example: + >>> text = BaseText('Lorem ipsum dolor sit amet...') + >>> print(text.re_search('Lorem ipsum')) + True + >>> print(text.re_search('Arma virumque cano')) + False + """ # noqa + # Converting pattern to regex + pattern = re.compile(pattern) + if pattern.search(self.data): + return True + else: + return False diff --git a/dhelp/text/_bases_mixins.py b/dhelp/text/_bases_mixins.py deleted file mode 100644 index d5d0823..0000000 --- a/dhelp/text/_bases_mixins.py +++ /dev/null @@ -1,635 +0,0 @@ -#!/usr/bin/python - -import pip -import re -from collections import UserString - -import nltk -from nltk.text import Text -from nltk.tokenize.punkt import PunktLanguageVars -from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize -from nltk.util import ngrams, bigrams, trigrams, skipgrams -from nltk.stem.wordnet import WordNetLemmatizer -from nltk import pos_tag - - -class BaseText(UserString): - """Performs text manipulation and natural language processing. - - Base class for all Text objects. Can be used on its own to perform a number - of operations, although it is best used with on of its language-specific - children. - - Args: - text (:obj:`str`) Text to be stored for processing/nlp - options (:obj:`dict`, optional) Options settings found at respective keywords - - Example: - >>> from dhelp import BaseText - >>> text = BaseText('Lorem ipsum dolor sit amet...') - >>> print(text) - 'Lorem ipsum dolor sit amet...' - """ # noqa - - def __init__(self, text, options={}): - super().__init__(str) - if 'encoding' not in options: - options['encoding'] = 'utf-8' - if 'language' not in options: - options['language'] = 'english' - self.data = text - self.options = options - - def stringify(self): - """Returns the text of this object as a pure string type. 
- - Can be useful when you need the text back in a string object format - for comparison with regular strings. - - Returns: - :obj:`str` String form of the text - - Example: - >>> text = BaseText('Lorem ipsum dolor sit amet...') - >>> stringified_text = text.stringify() - >>> print(type(stringified_text)) - - """ - return str(self.data) - - def rm_lines(self): - """Removes endlines. - - Gives a new version of the text with all endlines removed. Removes - any dashed line endings and rejoins split words. - - Returns: - :obj:`self.__class__` New version of text, with endlines removed - - Example: - >>> text = BaseText('Lorem\\nipsum do-\\nlor sit amet....\\n') - >>> modified_text = text.rm_lines() - >>> print(modified_text) - 'Lorem ipsum dolor sit amet...' - """ - rexr = re.compile(r'\n+') - # substituting single endlines for matching endline blocks - clean_text = rexr.sub(' ', self.data) - return self.__class__( - clean_text - .replace('-\n ', '').replace('- \n', '').replace('-\n', '') - .replace(' - ', '').replace('- ', '').replace(' -', '') - .replace('\n', ' '), - self.options - ) - - def rm_nonchars(self): - """Removes non-language characters. - - Gives a new version of the text with only latin characters remaining, - or Greek characters for Greek, texts, and so on. Defaults to assuming - Latin based. - - Returns: - :obj:`self.__class__` Returns new version of text, with non-letters removed - - Example: - >>> text = BaseText('1αLorem ipsum 2βdolor sit 3γamet...') - >>> modified_text = text.rm_nonchars() - >>> print(modified_text) - 'Lorem ipsum dolor sit amet...' - """ # noqa - if self.options['language'] == 'greek': - valid_chars_pattern = '([ʹ-Ϋά-ϡἀ-ᾯᾰ-῾ ])' - else: - valid_chars_pattern = '([A-Za-z ])' - return self.__class__( - "".join(re.findall(valid_chars_pattern, self.data)), - self.options - ) - - def rm_edits(self): - """Removes text inside editor's marks. 
- - Gives a new version with any text between editorial marks such as - brackets or parentheses removed. - - Returns: - :obj:`self.__class__` Returns new version of text, with editoria removed - - Example: - >>> text = BaseText('Lore[m i]psum {dolo}r sit a(met)...') - >>> modified_text = text.rm_edits() - >>> print(modified_text) - 'Lor psum r sit a...' - """ # noqa - return self.__class__( - re.sub("\〚(.*?)\〛", "", re.sub("\{(.*?)\}", "", re.sub( - "\((.*?)\)", "", re.sub("\<(.*?)\>", "", re.sub( - "\[(.*?)\]", "", self.data))))), - self.options - ) - - def rm_spaces(self): - """Removes extra whitespace. - - Gives a new version of the text with extra whitespace collapsed. - - Returns: - :obj:`self.__class__` Returns new version of text, with extra spaced collapsed - - Example: - >>> text = BaseText('Lorem ipsum dolor sit amet...') - >>> modified_text = text.rm_spaces() - >>> print(modified_text) - 'Lorem ipsum dolor sit amet...' - """ # noqa - # regex compiler for all whitespace blocks - rexr = re.compile(r'\s+') - # substituting single spaces for matching whitespace blocks - clean_text = rexr.sub(' ', self.data) - return self.__class__( - clean_text.strip(), - self.options - ) - - def re_search(self, pattern): - """Search text for matching pattern. - - Receives search pattern and returns True/False if it matches. Pattern - can be a simple string match (e.g. .re_search('does this match?')), or - a full Regular Expression. - - Args: - pattern (:obj:`str`) String with the desired Regular Expression to search - - Returns: - :obj:`bool` True if matching, False if not - - Example: - >>> text = BaseText('Lorem ipsum dolor sit amet...') - >>> print(text.re_search('Lorem ipsum')) - True - >>> print(text.re_search('Arma virumque cano')) - False - """ # noqa - # Converting pattern to regex - pattern = re.compile(pattern) - if pattern.search(self.data): - return True - else: - return False - - -class NLTKMixin: - """Mixin for NLTK-related functions. 
- - Mixin class which provides access to NLTK-specific functions. This class - should be mixed with some base class (e.g. EnglishText) to give it nlp - related functions. - - Example: - >>> class EnglishText(NLTKTextMixin, EnglishText): - """ - - def setup(self): - """Download NLTK packages and trainer corpora. - - Launches the NLTK package download interface. Overridden by the CLTK - child classes to launch the automated CLTK downloader. Convenience - method if user has not already downloaded NLTK packages and trainer - sets. - - Example: - >>> EnglishText('').setup() - """ - nltk.download('punkt') - nltk.download('wordnet') - nltk.download('words') - nltk.download('large_grammars') - nltk.download('averaged_perceptron_tagger') - nltk.download('hmm_treebank_pos_tagger') - nltk.download('maxent_treebank_pos_tagger') - nltk.download('universal_tagset') - nltk.download('maxent_ne_chunker') - return True - - def rm_stopwords(self, stoplist=[]): - """Removes words or phrases from the text. - - Given a list of words or phrases, gives new text with those phrases - removed. - - Args: - stoplist (:obj:`list`) List of words or phrases to filter from text - - Returns: - :obj:`self.__class__` New version of text, with stop words/phrases removed - - Example: - >>> stopwords = ['ipsum', 'sit'] - >>> text = EnglishText('Lorem ipsum dolor sit amet...') - >>> text.rm_stopwords(stoplist=stopwords) - >>> print(modified_text) - 'Lorem dolor amet...' 
- """ # noqa - filtered_words = [] - # converts text to list of words with NLTK tokenizer - tokenizer = PunktLanguageVars() - tokens = tokenizer.word_tokenize(str(self.data)) - # loop through each word, if not in stoplist, append - for word in tokens: - not_found = True - for stopword in stoplist: - if str(word).strip().lower() == str(stopword).strip().lower(): - not_found = False - if not_found: - filtered_words.append(word) - # return rejoined word - return self.__class__( - " ".join(filtered_words), - self.options - ) - - def lemmatize(self): - """Transforms words into their lemmata. - - Gives a new version of the text in which every word is lemmatized. All - verbs are transformed into the first person singular present active, - all nouns are transformed into the singular masculine nominative, et.c. - - Returns: - :obj:`self.__class__` New version of the text with tokens transformed to their lemmata - - Example: - >>> text = EnglishText('The quick brown fox jumped over the lazy dog.') - >>> print(text.lemmatize()) - 'The quick brown fox jump over the lazy dog .' - """ # noqa - tagged_words = self.tag() - lemmata = [] - lemmatizer = WordNetLemmatizer() - for word, parsing in tagged_words: - # Grab main part of speech from first character in POS - pos = parsing[0] - try: - lemmatized_word = lemmatizer.lemmatize( - word.lower(), pos=pos.lower()[0] - ) - except: - lemmatized_word = word - lemmata.append(lemmatized_word) - return self.__class__( - " ".join(lemmata), - self.options - ) - - def tokenize(self, mode='word'): - """ Splits words (or sentences) into lists of strings - - Returns a tokenized list. By default returns list of words, but can - also return as a list of sentences. - - Args: - mode (:obj:`str`) Specifies tokenize mode, either 'word', 'sentence', or 'wordpunct' - - Returns: - :obj:`list` List of (string) tokens - - Example: - >>> text = EnglishText('Lorem ipsum dolor sit amet. 
Consectetur adipiscing elit.') # noqa - >>> print(EnglishText.tokenize()) - ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', '.', 'Consectetur', 'adipiscing', 'elit', '.'] - >>> print(EnglishText.tokenize(mode='sentence')) - ['Lorem ipsum dolor sit amet.', 'Consectetur adipiscing elit.'] - """ # noqa - if mode == 'sentence': - return ( - sent_tokenize(self.data) - ) - elif mode == 'wordpunct': - return wordpunct_tokenize(self.data) - else: - return word_tokenize(self.data) - - def tag(self): - """Performs part-of-speech analysis on the text. - - Returns list of words marked up with parts of speech. Each word is - returned as a 2-tuple, the first containing the word, the second with - the parts of speech. - - Returns: - (:obj:`list`) Words tagged as 2-tuples (word|part of speech) - - Example: - >>> text = EnglishText('They hated to think of sample sentences.') - >>> basic_tags = text.tag() - >>> print(basic_tags) - [('They', 'PRP'), ('hated', 'VBD'), ('to', 'TO'), ('think', 'VB'), ('of', 'IN'), ('sample', 'JJ'), ('sentences', 'NNS'), ('.', '.')] - """ # noqa - word_list = list(self.tokenize()) - return pos_tag(word_list) - - def ngrams(self, gram_size=3): - """Gives ngrams. - - Returns a list of ngrams, each ngram represented as a tuple. 
- - Args: - gram_size (:obj:`int`, optional) Size of the ngrams to generate - - Returns: - :obj:`list` of :obj:`tuple` Words of each ngram - - Example: - >>> text = EnglishText('They hated to think of sample sentences.') - >>> basic_ngrams = text.ngrams() - >>> print(basic_ngrams) - [('They', 'hated', 'to'), ('hated', 'to', 'think'), ('to', 'think', 'of'), ('think', 'of', 'sample'), ('of', 'sample', 'sentences'), ('sample', 'sentences', '.')] - """ # noqa - tokens = self.tokenize() - if gram_size < 2: # pragma: no cover - gram_size = 2 - if gram_size == 2: # pragma: no cover - return list(bigrams(tokens)) - if gram_size == 3: - return list(trigrams(tokens)) - else: # pragma: no cover - return list(ngrams(tokens, gram_size)) - - def skipgrams(self, gram_size=3, skip_size=1): - """Gives skipgrams. - - Returns list of skipgrams, similar to ngram, but allows spacing between - tokens. - - Args: - gram_size (:obj:`int`, optional) Size of the ngrams to generate - skip_size (:obj:`int`, optional) Size of max spacing allowed - - Returns: - :obj:`list` of :obj:`tuple` Words of each skipgram - - Example: - >>> text = EnglishText('They hated to think of sample sentences.') - >>> basic_skipgrams = text.skipgrams() - >>> print(basic_skipgrams) - [('They', 'hated', 'to'), ('They', 'hated', 'think'), ('They', 'to', 'think'), ('hated', 'to', 'think'), ('hated', 'to', 'of'), ('hated', 'think', 'of'), ('to', 'think', 'of'), ('to', 'think', 'sample'), ('to', 'of', 'sample'), ('think', 'of', 'sample'), ('think', 'of', 'sentences'), ('think', 'sample', 'sentences'), ('of', 'sample', 'sentences'), ('of', 'sample', '.'), ('of', 'sentences', '.'), ('sample', 'sentences', '.')] # noqa - """ - tokens = self.tokenize() - return list(skipgrams(tokens, gram_size, skip_size)) - - def word_count(self, word=None): - """Returns counter dictionary with word counts at respective keywords. - - Performs word counts and then stores their values in the respective - keyword of a counter dictionary. 
If a word is passed, a simple integer - count of the number of appearances is returned. - - Args: - word (:obj:`string`, optional) A single word you want to count - - Returns: - :obj:`dict` A dictionary with word counts stored in respective keywords - - Example: - >>> # TODO: - """ # noqa - counts = dict(Text(self.tokenize()).vocab()) - # If a single word was specified, only return that frequency - if word: - return counts[word] - return counts - - -class CLTKMixin(NLTKMixin): - """Mixin for CLTK-related functions. - - Parent class for Latin, Classical Greek, and other CLTK language-specific - objects. Provides access to universal CLTK commands with child classes - adding some methods and overriding others. - """ - - def setup(self): - """Download CLTK packages and trainer corpora. - - Launches the CLTK package download interface. Overridden by the CLTK - child classes to launch the automated CLTK downloader. Convenience - method if user has not already downloaded CLTK packages and trainer - sets. - - Example: - >>> LatinText('').setup() - """ - # first, download the cltk module from pip - pip.main(['install', 'cltk']) - # import cltk inline as global import errors for non-cltk users - from cltk.corpus.utils.importer import CorpusImporter - corpus_importer = CorpusImporter(self.options['language']) - # loop through and attempt to download, skip any errors - for cltk_corpus in corpus_importer.list_corpora: - print('Downloading', cltk_corpus) - try: - corpus_importer.import_corpus(cltk_corpus) - except: - print('Problem downloading', cltk_corpus, '(skipping)') - print('Finished downloading corpora') - return True - - def tokenize(self, mode='word'): - """Tokenizes the passage into lists of words or sentences. - - Breaks text words into individual tokens (strings) by default. If - mode is set to sentence, returns lists of sentences. 
- - Args: - mode (:obj:`str`) Mode of tokenization, either 'word' or 'sentence' - - Returns: - :obj:`list` of :obj:`str` Tokenized words (or sentences) - - Example: - >>> LatinText('Gallia est omnis divisa in partes tres').tokenize() - ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres'] - - """ - from cltk.tokenize.word import nltk_tokenize_words - from cltk.tokenize.sentence import TokenizeSentence - if mode == 'sentence': - return TokenizeSentence( - self.options['language'] - ).tokenize_sentences(self.data) - else: - return nltk_tokenize_words(self.data) - - def lemmatize(self, return_string=True, return_raw=False): - """Transforms words into their lemmata. - - Gives a new version of the text in which every word is lemmatized. All - verbs are transformed into the first person singular present active, - all nouns are transformed into the singular masculine nominative, et.c. - - Returns: - :obj:`self.__class__` New version of the text with tokens transformed to their lemmata - - Example: - >>> text = LatinText('Gallia est omnis divisa in partes tres') - >>> print(text.lemmatize()) - gallia edo1 omne divido in pars tres - """ # noqa - from cltk.stem.lemma import LemmaReplacer - return self.__class__( - text=LemmaReplacer( - self.options['language'] - ).lemmatize( - self.data.lower(), - return_string=return_string, - return_raw=return_raw - ), - options=self.options - ) - - # TODO: This function does not work for Greek currently - def scansion(self): - """Gives list of scanned feet. - - Returns list of strings, each string representing the beats of a given - foot. As in standard notation, dactyls are marked as '¯' and spondee's - as '˘'. 
- - Returns: - :obj:`list` Scanned feet - - Example: - >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') - >>> print(text.scansion()) - ['¯˘˘¯˘˘˘˘˘¯˘˘˘˘˘x'] - """ # noqa - if self.options['language'] == 'greek': - from cltk.prosody.greek.scanner import Scansion as GreekScansion - return GreekScansion().scan_text(self.data) - elif self.options['language'] == 'latin': - from cltk.prosody.latin.scanner import Scansion as LatinScansion - return LatinScansion().scan_text(self.data) - - def entities(self, lemmatize=False, unique=False): - """Returns a list of entities recognized in the text. - - Uses cltk's built in named-entity recognition. Reorganizes cltk's raw - output from list of tuples to list of strings. Every entity recognized - is added to the list returned. Unless unique option is set, entities - which appear multiple times will be returned multiple times in the - list. - - Args: - lemmatize (:obj:`bool`, optional) Set True to lemmatize text before searching for entities - unique (:obj:`bool`, optional) Set True and no entity appears in the return list more than once - Example: - >>> text = LatinText('Gallia est omnis divisa in partes tres') - >>> print(text.entities()) - ['Gallia'] - """ # noqa - from cltk.stem.lemma import LemmaReplacer - from cltk.tag import ner - entity_list = [] - # filtering non-entities - for result in ner.tag_ner( - self.options['language'], - input_text=self.data, - output_type=list - ): - # appending if item flagged as entity in tuple[1] - try: - if result[1] == 'Entity': - entity_list.append(result[0]) - # do nothing if 'Entity' not specified - except: - pass - # removing duplicate entities if unique option specified - if unique: - entity_list = list(set(entity_list)) - # lemmatizing entities if option has been specified - if lemmatize: - entity_list = LemmaReplacer(self.options['language']).lemmatize( - entity_list, - return_string=False, - return_raw=False - ) - return entity_list - - # currently not working, 
TODO: fix or remove this code - # def compare_levenshtein(self, other_text): - # """Gives the levenshtein difference between this and any passed text. - # - # Args: - # other_text (:obj:`str`) String for comparison - # - # Returns: - # :obj:`float` Levenshtein difference between texts - # - # Example: - # >>> # TODO: - # - # """ # noqa - # from cltk.text_reuse.levenshtein import Levenshtein - # return Levenshtein().ratio(self.data, other_text) - - def compare_longest_common_substring(self, other_text): - """Gives the longest excerpt that this and any passed text have in common. - - Args: - other_text (:obj:`str`) String for comparison - - Returns: - :obj:`str` Longest common substring - - Example: - >>> text = LatinText('Gallia est omnis divisa in partes tres') - >>> print(text.compare_longest_common_substring('Galliae sunt omnis divisae in partes tres')) - in partes tres - """ # noqa - from cltk.text_reuse.comparison import long_substring - return long_substring(self.data, other_text) - - def compare_minhash(self, other_text): - """Gives the minimum hash between this and any passed text. - - Args: - other_text (:obj:`str`) String for comparison - - Returns: - :obj:`float` Minimum hash between texts - - Example: - >>> text = LatinText('Gallia est omnis divisa in partes tres') - >>> print(text.compare_minhash('Galliae sunt omnis divisae in partes tres')) - 0.6444444444444445 - """ # noqa - from cltk.text_reuse.comparison import minhash - return minhash(self.data, other_text) - - def word_count(self, word=None): - """Returns counter dictionary with word counts at respective keywords. - - Performs word counts and then stores their values in the respective - keyword of a counter dictionary. If a word is passed, a simple integer - count of the number of appearances is returned. 
- - Args: - word (:obj:`string`, optional) A single word you want to count - - Returns: - :obj:`dict` A dictionary with word counts stored in respective keywords - - Example: - >>> text = LatinText('Gallia est omnis divisa in partes tres tres tres') - >>> print(text.word_count(word='tres')) - 3 - """ # noqa - from cltk.utils.frequency import Frequency - counts = Frequency().counter_from_str(self.data) - # If a single word was specified, only return that frequency - if word: - return counts[word] - return counts diff --git a/dhelp/text/ancient_greek.py b/dhelp/text/ancient_greek.py deleted file mode 100644 index f452587..0000000 --- a/dhelp/text/ancient_greek.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/python - -from ._bases_mixins import BaseText, CLTKMixin - - -class AncientGreekText(CLTKMixin, BaseText): - """Main class to interact with Classical Greek-language texts. - - Provides Classical Greek-specific CLTK functions for text passed upon - construction. Most methods return a new version of the text, except those - that give non-text results (e.g. pos tagging) - - Example: - >>> from dhelp import AncientGreekText - >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.') - >>> print(text.lemmatize()) - εἰμί δὲ σύμπας οὗτος τὰ σύγγραμμα ἐκεῖνος μάλιστα οὐ ὠφέλιμος , ὅστις ὡς πρὸς οἶδα συγγράφω. - """ # noqa - - def __init__(self, text, options={}): - options['language'] = 'greek' - super().__init__(text=text, options=options) - - def normalize(self): - """Fixes problems with differences in greek accent encoding. - - Certain Greek accents have more than one possible encoding. Uses cltk's - built-in normalizer to correct the character encoding differences and - ensure that accents are encoded the same way. 
- - Returns: - :obj:`self.__class__` New instance with altered text - - Example: - >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι') - >>> print(text.normalize()) - ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι - """ # noqa - from cltk.corpus.utils.formatter import cltk_normalize - return self.__class__( - text=cltk_normalize(str(self.data)), - options=self.options - ) - - def tlgu_cleanup(self, rm_punctuation=True, rm_periods=False): - """Fix TLG betacode texts using TLGU. - - Necessary to cleanup TLG texts before processing, but can also used to - perform rudimentary cleaning operations on other Greek texts. - - Args: - rm_punctuation (:obj:`bool`, optional) True to remove punctuation marks (exception periods) - rm_periods (:obj:`bool`, optional) True to remove periods - - Returns: - :obj:`self.__class__` New instance with altered text - - Example: - >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι') - >>> print(text.tlgu_cleanup()) - ῖν εἰς δὲ τὸν ἕτερον καττίτερον εἰ λῶιον καὶ ἄμεινόν ἐστι - """ # noqa - from cltk.corpus.utils.formatter import tlg_plaintext_cleanup - return self.__class__( - text=tlg_plaintext_cleanup( - self.data, rm_punctuation=rm_punctuation, rm_periods=rm_periods - ), - options=self.options - ) - - def tag(self, mode='123'): - """Gives words marked up with parts-of-speech. - - Override's the cltk POS tagger and uses cltk's instead. Has different - methods for providing a POS tagger, if desired. 
- - Args: - mode (:obj:`str`) Tagging mode, either '123', or 'tnt' - - Returns: - :obj:`list` of :obj:`tuple` 2-tuples with word, part-of-speech - - Example: - >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.') - >>> print(text.tag()) - [('ἔστι', 'V3SPIA---'), ('δὲ', 'G--------'), ('σύμπαντα', None), ('ταῦτα', 'A-P---NA-'), ('τὰ', 'L-P---NA-'), ('συγγράμματα', None), ('ἐκείνῃ', 'A-S---FD-'), ('μάλιστα', 'D--------'), ('οὐκ', 'D--------'), ('ὠφέλιμα', None), (',', 'U--------'), ('ὅτι', 'C--------'), ('ὡς', 'C--------'), ('πρὸς', 'R--------'), ('εἰδότας', 'T-PRPAMA-'), ('συγγέγραπται', None), ('.', '---------')] - """ # noqa - from cltk.tag.pos import POSTag - tagger = POSTag(self.options['language']) - mode = mode.lower() - if mode != '123' and mode != 'tnt': - raise Exception( - 'Invalid part of speech tagging mode specified.' - ) - elif mode == '123': - return tagger.tag_ngram_123_backoff(self.data) - elif mode == 'tnt': - return tagger.tag_tnt(self.data) diff --git a/dhelp/text/cltk.py b/dhelp/text/cltk.py new file mode 100644 index 0000000..db0de7a --- /dev/null +++ b/dhelp/text/cltk.py @@ -0,0 +1,448 @@ +#!/usr/bin/python + +import importlib +import pip + +from ._bases import BaseText +from .nltk import NLTKMixin + + +class CLTKMixin(NLTKMixin): + """Mixin for CLTK-related functions. + + Parent class for Latin, Classical Greek, and other CLTK language-specific + objects. Provides access to universal CLTK commands with child classes + adding some methods and overriding others. + """ + + def setup(self): + """Download CLTK packages and trainer corpora. + + Launches the CLTK package download interface. Overridden by the CLTK + child classes to launch the automated CLTK downloader. Convenience + method if user has not already downloaded CLTK packages and trainer + sets. 
+ + Example: + >>> LatinText('').setup() + """ + # check if cltk is already installed, if not, install it + if not importlib.find_loader('cltk'): + pip.main(['install', 'cltk']) + # include cltk inline + from cltk.corpus.utils.importer import CorpusImporter + setup_language = self.options['language'] + # for ancient greek, change to 'greek' for purposes of cltk setup + if setup_language == 'ancient greek': + setup_language = 'greek' + corpus_importer = CorpusImporter(setup_language) + # loop through, check if extant, attempt to download, skip any errors + for cltk_corpus in corpus_importer.list_corpora: + print('Downloading', cltk_corpus) + try: + corpus_importer.import_corpus(cltk_corpus) + except: + print('Problem downloading', cltk_corpus, '(skipping)') + return True + + def tokenize(self, mode='word'): + """Tokenizes the passage into lists of words or sentences. + + Breaks text words into individual tokens (strings) by default. If + mode is set to sentence, returns lists of sentences. + + Args: + mode (:obj:`str`) Mode of tokenization, either 'word' or 'sentence' + + Returns: + :obj:`list` of :obj:`str` Tokenized words (or sentences) + + Example: + >>> LatinText('Gallia est omnis divisa in partes tres').tokenize() + ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres'] + + """ + from cltk.tokenize.word import nltk_tokenize_words + from cltk.tokenize.sentence import TokenizeSentence + if mode == 'sentence': + return TokenizeSentence( + self.options['language'] + ).tokenize_sentences(self.data) + else: + return nltk_tokenize_words(self.data) + + def lemmatize(self, return_string=True, return_raw=False): + """Transforms words into their lemmata. + + Gives a new version of the text in which every word is lemmatized. All + verbs are transformed into the first person singular present active, + all nouns are transformed into the singular masculine nominative, et.c. 
+ + Returns: + :obj:`self.__class__` New version of the text with tokens transformed to their lemmata + + Example: + >>> text = LatinText('Gallia est omnis divisa in partes tres') + >>> print(text.lemmatize()) + gallia edo1 omne divido in pars tres + """ # noqa + from cltk.stem.lemma import LemmaReplacer + return self.__class__( + text=LemmaReplacer( + self.options['language'] + ).lemmatize( + self.data.lower(), + return_string=return_string, + return_raw=return_raw + ), + options=self.options + ) + + # TODO: This function does not work for Greek currently + def scansion(self): + """Gives list of scanned feet. + + Returns list of strings, each string representing the beats of a given + foot. As in standard notation, dactyls are marked as '¯' and spondee's + as '˘'. + + Returns: + :obj:`list` Scanned feet + + Example: + >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') + >>> print(text.scansion()) + ['¯˘˘¯˘˘˘˘˘¯˘˘˘˘˘x'] + """ # noqa + if self.options['language'] == 'greek': + from cltk.prosody.greek.scanner import Scansion as GreekScansion + return GreekScansion().scan_text(self.data) + elif self.options['language'] == 'latin': + from cltk.prosody.latin.scanner import Scansion as LatinScansion + return LatinScansion().scan_text(self.data) + + def entities(self, lemmatize=False, unique=False): + """Returns a list of entities recognized in the text. + + Uses cltk's built in named-entity recognition. Reorganizes cltk's raw + output from list of tuples to list of strings. Every entity recognized + is added to the list returned. Unless unique option is set, entities + which appear multiple times will be returned multiple times in the + list. 
+ + Args: + lemmatize (:obj:`bool`, optional) Set True to lemmatize text before searching for entities + unique (:obj:`bool`, optional) Set True and no entity appears in the return list more than once + Example: + >>> text = LatinText('Gallia est omnis divisa in partes tres') + >>> print(text.entities()) + ['Gallia'] + """ # noqa + from cltk.stem.lemma import LemmaReplacer + from cltk.tag import ner + entity_list = [] + # filtering non-entities + for result in ner.tag_ner( + self.options['language'], + input_text=self.data, + output_type=list + ): + # appending if item flagged as entity in tuple[1] + try: + if result[1] == 'Entity': + entity_list.append(result[0]) + # do nothing if 'Entity' not specified + except: + pass + # removing duplicate entities if unique option specified + if unique: + entity_list = list(set(entity_list)) + # lemmatizing entities if option has been specified + if lemmatize: + entity_list = LemmaReplacer(self.options['language']).lemmatize( + entity_list, + return_string=False, + return_raw=False + ) + return entity_list + + # currently not working, TODO: fix or remove this code + # def compare_levenshtein(self, other_text): + # """Gives the levenshtein difference between this and any passed text. + # + # Args: + # other_text (:obj:`str`) String for comparison + # + # Returns: + # :obj:`float` Levenshtein difference between texts + # + # Example: + # >>> # TODO: + # + # """ # noqa + # from cltk.text_reuse.levenshtein import Levenshtein + # return Levenshtein().ratio(self.data, other_text) + + def compare_longest_common_substring(self, other_text): + """Gives the longest excerpt that this and any passed text have in common. 
+ + Args: + other_text (:obj:`str`) String for comparison + + Returns: + :obj:`str` Longest common substring + + Example: + >>> text = LatinText('Gallia est omnis divisa in partes tres') + >>> print(text.compare_longest_common_substring('Galliae sunt omnis divisae in partes tres')) + in partes tres + """ # noqa + from cltk.text_reuse.comparison import long_substring + return long_substring(self.data, other_text) + + def compare_minhash(self, other_text): + """Gives the minimum hash between this and any passed text. + + Args: + other_text (:obj:`str`) String for comparison + + Returns: + :obj:`float` Minimum hash between texts + + Example: + >>> text = LatinText('Gallia est omnis divisa in partes tres') + >>> print(text.compare_minhash('Galliae sunt omnis divisae in partes tres')) + 0.6444444444444445 + """ # noqa + from cltk.text_reuse.comparison import minhash + return minhash(self.data, other_text) + + def word_count(self, word=None): + """Returns counter dictionary with word counts at respective keywords. + + Performs word counts and then stores their values in the respective + keyword of a counter dictionary. If a word is passed, a simple integer + count of the number of appearances is returned. + + Args: + word (:obj:`string`, optional) A single word you want to count + + Returns: + :obj:`dict` A dictionary with word counts stored in respective keywords + + Example: + >>> text = LatinText('Gallia est omnis divisa in partes tres tres tres') + >>> print(text.word_count(word='tres')) + 3 + """ # noqa + from cltk.utils.frequency import Frequency + counts = Frequency().counter_from_str(self.data) + # If a single word was specified, only return that frequency + if word: + return counts[word] + return counts + + +class LatinText(CLTKMixin, BaseText): + """Main class to interact with Latin-language texts. + + Provides Latin-specific CLTK functions for text passed upon construction. 
+ Most methods return a new version of the text, except those that give + non-text results (e.g. pos tagging) + + Example: + >>> from dhelp import LatinText + >>> text = LatinText('Gallia est omnis divisa in partes tres') + >>> print(text.lemmatize()) + gallia edo1 omne divido in pars tres + """ + + def __init__(self, text, options={}): + options['language'] = 'latin' + super().__init__(text=text, options=options) + + def macronize(self, mode='tag_ngram_123_backoff'): + """Adds macrons (long vowel marks). + + Macrons distinguish long vowels from short. Distinguishing them is + critical for the study of Latin poetry and occasionally is important + in prose. Note that once you add macrons, long vowels are, for all + intents and purposes, different letters than their short equivalents. + + Args: + mode (:obj:`str`, optional) POS tagging method to use, 'tag_ngram_123_backoff', 'tag_tnt', or 'tag_crf' + + Returns: + :obj:`self.__class__` New text with macrons added to long vowels + + Example: + >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') + >>> print(text.macronize()) + arma virumque cano , trojae quī prīmus ab ōrīs + """ # noqa + from cltk.prosody.latin.macronizer import Macronizer + mode = mode.lower() + if ( + mode != 'tag_ngram_123_backoff' and + mode != 'tag_tnt' and + mode != 'tag_crf' + ): + return False + return self.__class__( + Macronizer(tagger=mode).macronize_text(self.data), + self.options + ) + + def normalize(self): + """Replaces 'j's with 'i's and 'v's with 'u's. + + Ancient texts did not use j's or 'v's (viz. Indiana Jones and the Last + Crusade), but their usage in modern texts can throw off word counts, + pattern mataching, and general text-analysis methods. This method + converts these letters to their ancient versions. 
+ + Returns: + :obj:`self.__class__` New text with macrons added to long vowels + + Example: + >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') + >>> print(text.normalize()) + Arma uirumque cano, Troiae qui primus ab oris + """ # noqa + from cltk.stem.latin.j_v import JVReplacer + return self.__class__( + JVReplacer().replace(self.data), + self.options + ) + + def stemmify(self): + """Returns text with only stems. + + An alternate method to lemmatization. Instead of converting to lemmata + (principi -> princeps) converts to stemma (principi -> princp) + + Returns: + :obj:`self.__class__` New text with stemma + + Example: + >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') + >>> print(text.stemmify()) + arm vir cano, troi qui prim ab or + """ # noqa + from cltk.stem.latin.stem import Stemmer + return self.__class__( + Stemmer().stem(self.data.lower()), + self.options + ) + + def clausulae(self): + """Counts different kinds of prose clausulae. + + Examines prose for evidence for poetic rythms (clausulae). Returns a + keyword/value dict with total counts for each kind of clausula. 
+ + Returns: + :obj:`list` of `str` Individual clausulae results + + Example: + >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') + >>> print(text.clausulae()) + {'cretic + trochee': 0, '4th paeon + trochee': 0, '1st paeon + trochee': 0, 'substituted cretic + trochee': 0, '1st paeon + anapest': 0, 'double cretic': 0, '4th paeon + cretic': 0, 'molossus + cretic': 0, 'double trochee': 0, 'molossus + double trochee': 0, 'cretic + double trochee': 0, 'dactyl + double trochee': 0, 'choriamb + double trochee': 0, 'cretic + iamb': 0, 'molossus + iamb': 0, 'double spondee': 0, 'cretic + double spondee': 0, 'heroic': 0} + """ # noqa + from cltk.prosody.latin.clausulae_analysis import Clausulae + return Clausulae().clausulae_analysis(self.data) + + +class AncientGreekText(CLTKMixin, BaseText): + """Main class to interact with Classical Greek-language texts. + + Provides Classical Greek-specific CLTK functions for text passed upon + construction. Most methods return a new version of the text, except those + that give non-text results (e.g. pos tagging) + + Example: + >>> from dhelp import AncientGreekText + >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.') + >>> print(text.lemmatize()) + εἰμί δὲ σύμπας οὗτος τὰ σύγγραμμα ἐκεῖνος μάλιστα οὐ ὠφέλιμος , ὅστις ὡς πρὸς οἶδα συγγράφω. + """ # noqa + + def __init__(self, text, options={}): + options['language'] = 'greek' + super().__init__(text=text, options=options) + + def normalize(self): + """Fixes problems with differences in greek accent encoding. + + Certain Greek accents have more than one possible encoding. Uses cltk's + built-in normalizer to correct the character encoding differences and + ensure that accents are encoded the same way. 
+ + Returns: + :obj:`self.__class__` New instance with altered text + + Example: + >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι') + >>> print(text.normalize()) + ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι + """ # noqa + from cltk.corpus.utils.formatter import cltk_normalize + return self.__class__( + text=cltk_normalize(str(self.data)), + options=self.options + ) + + def tlgu_cleanup(self, rm_punctuation=True, rm_periods=False): + """Fix TLG betacode texts using TLGU. + + Necessary to cleanup TLG texts before processing, but can also used to + perform rudimentary cleaning operations on other Greek texts. + + Args: + rm_punctuation (:obj:`bool`, optional) True to remove punctuation marks (exception periods) + rm_periods (:obj:`bool`, optional) True to remove periods + + Returns: + :obj:`self.__class__` New instance with altered text + + Example: + >>> text = AncientGreekText('ῖν», εἰς δὲ τὸν ἕτερον κ[α]ττίτ[ερον «εἰ λῶιον καὶ ἄμει]νόν ἐστι') + >>> print(text.tlgu_cleanup()) + ῖν εἰς δὲ τὸν ἕτερον καττίτερον εἰ λῶιον καὶ ἄμεινόν ἐστι + """ # noqa + from cltk.corpus.utils.formatter import tlg_plaintext_cleanup + return self.__class__( + text=tlg_plaintext_cleanup( + self.data, rm_punctuation=rm_punctuation, rm_periods=rm_periods + ), + options=self.options + ) + + def tag(self, mode='123'): + """Gives words marked up with parts-of-speech. + + Override's the cltk POS tagger and uses cltk's instead. Has different + methods for providing a POS tagger, if desired. 
+ + Args: + mode (:obj:`str`) Tagging mode, either '123', or 'tnt' + + Returns: + :obj:`list` of :obj:`tuple` 2-tuples with word, part-of-speech + + Example: + >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.') + >>> print(text.tag()) + [('ἔστι', 'V3SPIA---'), ('δὲ', 'G--------'), ('σύμπαντα', None), ('ταῦτα', 'A-P---NA-'), ('τὰ', 'L-P---NA-'), ('συγγράμματα', None), ('ἐκείνῃ', 'A-S---FD-'), ('μάλιστα', 'D--------'), ('οὐκ', 'D--------'), ('ὠφέλιμα', None), (',', 'U--------'), ('ὅτι', 'C--------'), ('ὡς', 'C--------'), ('πρὸς', 'R--------'), ('εἰδότας', 'T-PRPAMA-'), ('συγγέγραπται', None), ('.', '---------')] + """ # noqa + from cltk.tag.pos import POSTag + tagger = POSTag(self.options['language']) + mode = mode.lower() + if mode != '123' and mode != 'tnt': + raise Exception( + 'Invalid part of speech tagging mode specified.' + ) + elif mode == '123': + return tagger.tag_ngram_123_backoff(self.data) + elif mode == 'tnt': + return tagger.tag_tnt(self.data) diff --git a/dhelp/text/english.py b/dhelp/text/english.py deleted file mode 100644 index e29aba1..0000000 --- a/dhelp/text/english.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/user/bin/python - -from ._bases_mixins import BaseText, NLTKMixin - - -class EnglishText(NLTKMixin, BaseText): - """Main class to interact with English-language texts. - - EnglishText provides methods for altering texts for pre-processing as well - as numerous nlp methods for analyzing the text. Text alteration methods - can be chained since they each return a new instance of the class created - with the altered text. 
- - Args: - text (:obj:`str`) Main text data - options (:obj:`dict`, optional) keyword/value dict for optional settings - - Attributes: - data (:obj:`str`) Main text data - options (:obj:`dict`, optional) keyword/value dict for optional settings - - Methods: - - Example: - >>> english_text = EnglishText('Th3e Qui\\nck b rown fox jumped over the lazy dog') - >>> english_text.rm_lines().rm_nonchars().rm_spaces() - The quick brown fox jumped over the lazy dog - """ # noqa - - def __init__(self, text, options={}): - options['language'] = 'english' - super().__init__(text=text, options=options) diff --git a/dhelp/text/latin.py b/dhelp/text/latin.py deleted file mode 100644 index 2d8fe11..0000000 --- a/dhelp/text/latin.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/python - -from ._bases_mixins import BaseText, CLTKMixin - - -class LatinText(CLTKMixin, BaseText): - """Main class to interact with Latin-language texts. - - Provides Latin-specific CLTK functions for text passed upon construction. - Most methods return a new version of the text, except those that give - non-text results (e.g. pos tagging) - - Example: - >>> from dhelp import LatinText - >>> text = LatinText('Gallia est omnis divisa in partes tres') - >>> print(text.lemmatize()) - gallia edo1 omne divido in pars tres - """ - - def __init__(self, text, options={}): - options['language'] = 'latin' - super().__init__(text=text, options=options) - - def macronize(self, mode='tag_ngram_123_backoff'): - """Adds macrons (long vowel marks). - - Macrons distinguish long vowels from short. Distinguishing them is - critical for the study of Latin poetry and occasionally is important - in prose. Note that once you add macrons, long vowels are, for all - intents and purposes, different letters than their short equivalents. 
- - Args: - mode (:obj:`str`, optional) POS tagging method to use, 'tag_ngram_123_backoff', 'tag_tnt', or 'tag_crf' - - Returns: - :obj:`self.__class__` New text with macrons added to long vowels - - Example: - >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') - >>> print(text.macronize()) - arma virumque cano , trojae quī prīmus ab ōrīs - """ # noqa - from cltk.prosody.latin.macronizer import Macronizer - mode = mode.lower() - if ( - mode != 'tag_ngram_123_backoff' and - mode != 'tag_tnt' and - mode != 'tag_crf' - ): - return False - return self.__class__( - Macronizer(tagger=mode).macronize_text(self.data), - self.options - ) - - def normalize(self): - """Replaces 'j's with 'i's and 'v's with 'u's. - - Ancient texts did not use j's or 'v's (viz. Indiana Jones and the Last - Crusade), but their usage in modern texts can throw off word counts, - pattern mataching, and general text-analysis methods. This method - converts these letters to their ancient versions. - - Returns: - :obj:`self.__class__` New text with macrons added to long vowels - - Example: - >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') - >>> print(text.normalize()) - Arma uirumque cano, Troiae qui primus ab oris - """ # noqa - from cltk.stem.latin.j_v import JVReplacer - return self.__class__( - JVReplacer().replace(self.data), - self.options - ) - - def stemmify(self): - """Returns text with only stems. - - An alternate method to lemmatization. Instead of converting to lemmata - (principi -> princeps) converts to stemma (principi -> princp) - - Returns: - :obj:`self.__class__` New text with stemma - - Example: - >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') - >>> print(text.stemmify()) - arm vir cano, troi qui prim ab or - """ # noqa - from cltk.stem.latin.stem import Stemmer - return self.__class__( - Stemmer().stem(self.data.lower()), - self.options - ) - - def clausulae(self): - """Counts different kinds of prose clausulae. 
- - Examines prose for evidence for poetic rythms (clausulae). Returns a - keyword/value dict with total counts for each kind of clausula. - - Returns: - :obj:`list` of `str` Individual clausulae results - - Example: - >>> text = LatinText('Arma virumque cano, Troiae qui primus ab oris') - >>> print(text.clausulae()) - {'cretic + trochee': 0, '4th paeon + trochee': 0, '1st paeon + trochee': 0, 'substituted cretic + trochee': 0, '1st paeon + anapest': 0, 'double cretic': 0, '4th paeon + cretic': 0, 'molossus + cretic': 0, 'double trochee': 0, 'molossus + double trochee': 0, 'cretic + double trochee': 0, 'dactyl + double trochee': 0, 'choriamb + double trochee': 0, 'cretic + iamb': 0, 'molossus + iamb': 0, 'double spondee': 0, 'cretic + double spondee': 0, 'heroic': 0} - """ # noqa - from cltk.prosody.latin.clausulae_analysis import Clausulae - return Clausulae().clausulae_analysis(self.data) diff --git a/dhelp/text/nltk.py b/dhelp/text/nltk.py new file mode 100644 index 0000000..6aaa001 --- /dev/null +++ b/dhelp/text/nltk.py @@ -0,0 +1,268 @@ +#!/usr/bin/python + +import os + +import nltk +from nltk.text import Text +from nltk.tokenize.punkt import PunktLanguageVars +from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize +from nltk.util import ngrams, bigrams, trigrams, skipgrams +from nltk.stem.wordnet import WordNetLemmatizer +from nltk import pos_tag + +from .. import settings +from ._bases import BaseText + + +class NLTKMixin: + """Mixin for NLTK-related functions. + + Mixin class which provides access to NLTK-specific functions. This class + should be mixed with some base class (e.g. EnglishText) to give it nlp + related functions. + + Example: + >>> class EnglishText(NLTKTextMixin, EnglishText): + """ + + @classmethod + def setup(self): + """Download NLTK packages and trainer corpora. + + Launches the NLTK package download interface. Overridden by the CLTK + child classes to launch the automated CLTK downloader. 
Convenience + method if user has not already downloaded NLTK packages and trainer + sets. + + Example: + >>> EnglishText.setup() + """ + for package, package_path_segments in settings.NLTK_PACKAGES[ + 'english' + ]: + package_path = os.sep.join(package_path_segments) + # will trigger error if no file, if file found, do nothing + try: + nltk.data.find(package_path) + pass + # if no file was found, download the respective package + except: + nltk.download(package) + return True + + def rm_stopwords(self, stoplist=[]): + """Removes words or phrases from the text. + + Given a list of words or phrases, gives new text with those phrases + removed. + + Args: + stoplist (:obj:`list`) List of words or phrases to filter from text + + Returns: + :obj:`self.__class__` New version of text, with stop words/phrases removed + + Example: + >>> stopwords = ['ipsum', 'sit'] + >>> text = EnglishText('Lorem ipsum dolor sit amet...') + >>> text.rm_stopwords(stoplist=stopwords) + >>> print(modified_text) + 'Lorem dolor amet...' + """ # noqa + filtered_words = [] + # converts text to list of words with NLTK tokenizer + tokenizer = PunktLanguageVars() + tokens = tokenizer.word_tokenize(str(self.data)) + # loop through each word, if not in stoplist, append + for word in tokens: + not_found = True + for stopword in stoplist: + if str(word).strip().lower() == str(stopword).strip().lower(): + not_found = False + if not_found: + filtered_words.append(word) + # return rejoined word + return self.__class__( + " ".join(filtered_words), + self.options + ) + + def lemmatize(self): + """Transforms words into their lemmata. + + Gives a new version of the text in which every word is lemmatized. All + verbs are transformed into the first person singular present active, + all nouns are transformed into the singular masculine nominative, et.c. 
+ + Returns: + :obj:`self.__class__` New version of the text with tokens transformed to their lemmata + + Example: + >>> text = EnglishText('The quick brown fox jumped over the lazy dog.') + >>> print(text.lemmatize()) + 'The quick brown fox jump over the lazy dog .' + """ # noqa + tagged_words = self.tag() + lemmata = [] + lemmatizer = WordNetLemmatizer() + for word, parsing in tagged_words: + # Grab main part of speech from first character in POS + pos = parsing[0] + try: + lemmatized_word = lemmatizer.lemmatize( + word.lower(), pos=pos.lower()[0] + ) + except: + lemmatized_word = word + lemmata.append(lemmatized_word) + return self.__class__( + " ".join(lemmata), + self.options + ) + + def tokenize(self, mode='word'): + """ Splits words (or sentences) into lists of strings + + Returns a tokenized list. By default returns list of words, but can + also return as a list of sentences. + + Args: + mode (:obj:`str`) Specifies tokenize mode, either 'word', 'sentence', or 'wordpunct' + + Returns: + :obj:`list` List of (string) tokens + + Example: + >>> text = EnglishText('Lorem ipsum dolor sit amet. Consectetur adipiscing elit.') # noqa + >>> print(EnglishText.tokenize()) + ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', '.', 'Consectetur', 'adipiscing', 'elit', '.'] + >>> print(EnglishText.tokenize(mode='sentence')) + ['Lorem ipsum dolor sit amet.', 'Consectetur adipiscing elit.'] + """ # noqa + if mode == 'sentence': + return ( + sent_tokenize(self.data) + ) + elif mode == 'wordpunct': + return wordpunct_tokenize(self.data) + else: + return word_tokenize(self.data) + + def tag(self): + """Performs part-of-speech analysis on the text. + + Returns list of words marked up with parts of speech. Each word is + returned as a 2-tuple, the first containing the word, the second with + the parts of speech. 
+ + Returns: + (:obj:`list`) Words tagged as 2-tuples (word|part of speech) + + Example: + >>> text = EnglishText('They hated to think of sample sentences.') + >>> basic_tags = text.tag() + >>> print(basic_tags) + [('They', 'PRP'), ('hated', 'VBD'), ('to', 'TO'), ('think', 'VB'), ('of', 'IN'), ('sample', 'JJ'), ('sentences', 'NNS'), ('.', '.')] + """ # noqa + word_list = list(self.tokenize()) + return pos_tag(word_list) + + def ngrams(self, gram_size=3): + """Gives ngrams. + + Returns a list of ngrams, each ngram represented as a tuple. + + Args: + gram_size (:obj:`int`, optional) Size of the ngrams to generate + + Returns: + :obj:`list` of :obj:`tuple` Words of each ngram + + Example: + >>> text = EnglishText('They hated to think of sample sentences.') + >>> basic_ngrams = text.ngrams() + >>> print(basic_ngrams) + [('They', 'hated', 'to'), ('hated', 'to', 'think'), ('to', 'think', 'of'), ('think', 'of', 'sample'), ('of', 'sample', 'sentences'), ('sample', 'sentences', '.')] + """ # noqa + tokens = self.tokenize() + if gram_size < 2: # pragma: no cover + gram_size = 2 + if gram_size == 2: # pragma: no cover + return list(bigrams(tokens)) + if gram_size == 3: + return list(trigrams(tokens)) + else: # pragma: no cover + return list(ngrams(tokens, gram_size)) + + def skipgrams(self, gram_size=3, skip_size=1): + """Gives skipgrams. + + Returns list of skipgrams, similar to ngram, but allows spacing between + tokens. 
+ + Args: + gram_size (:obj:`int`, optional) Size of the ngrams to generate + skip_size (:obj:`int`, optional) Size of max spacing allowed + + Returns: + :obj:`list` of :obj:`tuple` Words of each skipgram + + Example: + >>> text = EnglishText('They hated to think of sample sentences.') + >>> basic_skipgrams = text.skipgrams() + >>> print(basic_skipgrams) + [('They', 'hated', 'to'), ('They', 'hated', 'think'), ('They', 'to', 'think'), ('hated', 'to', 'think'), ('hated', 'to', 'of'), ('hated', 'think', 'of'), ('to', 'think', 'of'), ('to', 'think', 'sample'), ('to', 'of', 'sample'), ('think', 'of', 'sample'), ('think', 'of', 'sentences'), ('think', 'sample', 'sentences'), ('of', 'sample', 'sentences'), ('of', 'sample', '.'), ('of', 'sentences', '.'), ('sample', 'sentences', '.')] # noqa + """ + tokens = self.tokenize() + return list(skipgrams(tokens, gram_size, skip_size)) + + def word_count(self, word=None): + """Returns counter dictionary with word counts at respective keywords. + + Performs word counts and then stores their values in the respective + keyword of a counter dictionary. If a word is passed, a simple integer + count of the number of appearances is returned. + + Args: + word (:obj:`string`, optional) A single word you want to count + + Returns: + :obj:`dict` A dictionary with word counts stored in respective keywords + + Example: + >>> # TODO: + """ # noqa + counts = dict(Text(self.tokenize()).vocab()) + # If a single word was specified, only return that frequency + if word: + return counts[word] + return counts + + +class EnglishText(NLTKMixin, BaseText): + """Main class to interact with English-language texts. + + EnglishText provides methods for altering texts for pre-processing as well + as numerous nlp methods for analyzing the text. Text alteration methods + can be chained since they each return a new instance of the class created + with the altered text. 
+ + Args: + text (:obj:`str`) Main text data + options (:obj:`dict`, optional) keyword/value dict for optional settings + + Attributes: + data (:obj:`str`) Main text data + options (:obj:`dict`, optional) keyword/value dict for optional settings + + Methods: + + Example: + >>> english_text = EnglishText('Th3e Qui\\nck b rown fox jumped over the lazy dog') + >>> english_text.rm_lines().rm_nonchars().rm_spaces() + The quick brown fox jumped over the lazy dog + """ # noqa + + def __init__(self, text, options={}): + options['language'] = 'english' + super().__init__(text=text, options=options) diff --git a/dhelp/text/tests/test_ancient_greek.py b/dhelp/text/tests/test_ancient_greek.py index d5da5d0..0fd2a70 100644 --- a/dhelp/text/tests/test_ancient_greek.py +++ b/dhelp/text/tests/test_ancient_greek.py @@ -2,14 +2,23 @@ import unittest -from ..ancient_greek import AncientGreekText +import os + +from ..cltk import AncientGreekText class AncientGreekSetupLayer: @classmethod def setUp(cls): - AncientGreekText('').setup() + if not os.path.exists( + os.path.join( + os.path.expanduser('~'), + 'cltk_data', + 'greek' + ) + ): + AncientGreekText('').setup() class TestAncientGreekText(unittest.TestCase): diff --git a/dhelp/text/tests/test_english.py b/dhelp/text/tests/test_english.py index 4b11909..c1b2b35 100644 --- a/dhelp/text/tests/test_english.py +++ b/dhelp/text/tests/test_english.py @@ -2,14 +2,14 @@ import unittest -from ..english import EnglishText +from ..nltk import EnglishText class EnglishSetupLayer: @classmethod - def setUp(cls): - EnglishText('').setup() + def testSetUp(cls): + EnglishText.setup() class TestEnglishText(unittest.TestCase): diff --git a/dhelp/text/tests/test_latin.py b/dhelp/text/tests/test_latin.py index 47c87af..68ba16d 100644 --- a/dhelp/text/tests/test_latin.py +++ b/dhelp/text/tests/test_latin.py @@ -2,14 +2,23 @@ import unittest -from ..latin import LatinText +import os + +from ..cltk import LatinText class LatinSetupLayer: @classmethod def 
setUp(cls): - LatinText('').setup() + if not os.path.exists( + os.path.join( + os.path.expanduser('~'), + 'cltk_data', + 'latin' + ) + ): + LatinText('').setup() class TestLatinText(unittest.TestCase): diff --git a/dhelp/web/web_page.py b/dhelp/web.py similarity index 81% rename from dhelp/web/web_page.py rename to dhelp/web.py index 3ed8d90..f85e6f5 100644 --- a/dhelp/web/web_page.py +++ b/dhelp/web.py @@ -29,6 +29,7 @@ class WebPage(UserString): ... 'delay': 4, 'max_retries': 3, 'silent': True + 'parser': 'html.parser' ... } >>> web_page = WebPage('https://stackoverflow.com', options=options) https://stackoverflow.com @@ -45,10 +46,17 @@ def __init__(self, url, options={}): options['max_retries'] = 0 if 'silent' not in options: options['silent'] = False + if 'parser' not in options: + options['parser'] = 'html.parser' self.data = url - self.delay = options['delay'] - self.max_retries = options['max_retries'] - self.silent = options['silent'] + self.options = options + + def __enter__(self): + return self.soup() + + def __exit__(self, ctx_type, ctx_value, ctx_traceback): + if not self.options['silent']: + print('Successfully scraped', self.data) def fetch(self, retry_counter=0): """Returns http request from URL as a string. @@ -71,32 +79,33 @@ def fetch(self, retry_counter=0): Examples: >>> html_text = WebPage('https://stackoverflow.com/').fetch() - \\r\\n\\r\\n\r\\n \\r\\n\r\\n Stack Overflow... + <!DOCTYPE html>\\r\\n<html>\\r\\n\\r\\n <head>\\r\\n\\r\\n <title>Stack Overflow... 
""" # noqa # print message unless silent option - if not self.silent: + if not self.options['silent']: print('Fetching', self.data) # enforce delay to reduce server load - time.sleep(self.delay) + time.sleep(self.options['delay']) # attempt to fetch web page try: request = requests.get(self.data) # if error in getting page, call self recursively to try again except Exception: - print('Problem fetching', self.data) + if not self.options['silent']: + print('Problem fetching', self.data) # if infinite retries is set, always try again - if not self.max_retries: - if not self.silent: + if not self.options['max_retries']: + if not self.options['silent']: print('Retrying...') return self.fetch() # if below retry limit, return recursively and increment counter - elif retry_counter <= self.max_retries: - if not self.silent: + elif retry_counter <= self.options['max_retries']: + if not self.options['silent']: print('Retrying') return self.fetch(retry_counter=retry_counter+1) # otherwise retry limit has been hit, stop fetching else: - if not self.silent: + if not self.options['silent']: print('Retry limit reached, skipping', self.data) return None # if everything ok, returning page html instead of the entire request @@ -123,4 +132,4 @@ def soup(self): >>> print(header_logo_text.get_text()) Stack Overflow """ # noqa - return BeautifulSoup(self.fetch(), 'html.parser') + return BeautifulSoup(self.fetch(), self.options['parser']) diff --git a/dhelp/web/__init__.py b/dhelp/web/__init__.py deleted file mode 100644 index 013e4b7..0000000 --- a/dhelp/web/__init__.py +++ /dev/null @@ -1 +0,0 @@ -#!/usr/bin/python diff --git a/dhelp/web/tests/__init__.py b/dhelp/web/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/dhelp/web/tests/test_web_page.py b/dhelp/web/tests/test_web_page.py deleted file mode 100644 index 973ca31..0000000 --- a/dhelp/web/tests/test_web_page.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/python - -import unittest - -from bs4 import 
BeautifulSoup - -from ..web_page import WebPage - - -class TestWebPage(unittest.TestCase): - page = WebPage('https://stackoverflow.com', options={'silent': True}) - - def test_fetch(self): - # ensure request returns text data - return self.assertTrue(len(self.page.fetch()) > 0) - - def test_soup(self): - # ensure object is a BeautifulSoup type object - return self.assertTrue(type(self.page.soup()) == BeautifulSoup) diff --git a/docs/source/dhelp.files.rst b/docs/source/dhelp.files.rst deleted file mode 100644 index aed29ff..0000000 --- a/docs/source/dhelp.files.rst +++ /dev/null @@ -1,39 +0,0 @@ -dhelp.files package -=================== - -Submodules ----------- - -.. automodule:: dhelp.files.csv_file - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: dhelp.files.folder - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: dhelp.files.path - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: dhelp.files.text_file - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: dhelp.files.text_folder - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: dhelp.files - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/dhelp.rst b/docs/source/dhelp.rst index be9801b..6bc7cc8 100644 --- a/docs/source/dhelp.rst +++ b/docs/source/dhelp.rst @@ -1,14 +1,41 @@ dhelp package ============= -Subpackages ------------ +Submodules +---------- -.. toctree:: +dhelp.files module +------------------ + +.. automodule:: dhelp.files + :members: + :undoc-members: + :show-inheritance: + +dhelp.settings module +--------------------- + +.. automodule:: dhelp.settings + :members: + :undoc-members: + :show-inheritance: + +dhelp.text module +----------------- + +.. automodule:: dhelp.text + :members: + :undoc-members: + :show-inheritance: + +dhelp.web module +---------------- + +.. 
automodule:: dhelp.web + :members: + :undoc-members: + :show-inheritance: - dhelp.files - dhelp.text - dhelp.web Module contents --------------- diff --git a/docs/source/dhelp.text.rst b/docs/source/dhelp.text.rst deleted file mode 100644 index 3ab7b58..0000000 --- a/docs/source/dhelp.text.rst +++ /dev/null @@ -1,29 +0,0 @@ -dhelp.text package -================== - -Submodules ----------- - -.. automodule:: dhelp.text.ancient_greek - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: dhelp.text.english - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: dhelp.text.latin - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: dhelp.text - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/dhelp.web.rst b/docs/source/dhelp.web.rst deleted file mode 100644 index 7d65ec8..0000000 --- a/docs/source/dhelp.web.rst +++ /dev/null @@ -1,19 +0,0 @@ -dhelp.web package -================= - -Submodules ----------- - -.. automodule:: dhelp.web.web_page - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: dhelp.web - :members: - :undoc-members: - :show-inheritance: diff --git a/setup.py b/setup.py index e989cc6..49d1b4c 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ 'author_email': 'dave.a.base@gmail.com', 'description': """DH Python tools for scraping web pages, pre-processing data, and performing nlp analysis quickly.""", - 'version': '0.0.3', + 'version': '0.0.4', 'LICENSE': 'MIT', 'long_description': """Students often see great potential in Python for historical analysis. But, before they see real payoff they often face too