diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..457a2fd --- /dev/null +++ b/.editorconfig @@ -0,0 +1,39 @@ +# EditorConfig is awesome: http://EditorConfig.org + +# Howto with your editor: +# Sublime: https://github.com/sindresorhus/editorconfig-sublime + +# top-most EditorConfig file +root = true + +# Unix-style newlines with a newline ending every file +[**] +end_of_line = lf +insert_final_newline = true + +# Standard at: https://github.com/felixge/node-style-guide +[**.js, **.json] +trim_trailing_whitespace = true +indent_style = tab +quote_type = single +curly_bracket_next_line = false +spaces_around_operators = true +space_after_control_statements = true +space_after_anonymous_functions = false +spaces_in_brackets = false + +indent_size = 4 +insert_final_newline = true + +# No Standard. Please document a standard if different from .js +[**.yml, **.html, **.css] +trim_trailing_whitespace = true +indent_style = tab + +# No standard. Please document a standard if different from .js +[**.md] +indent_style = tab + +# Standard at: +[Makefile] +indent_style = tab diff --git a/.gitignore b/.gitignore index ce32b00..0e4f46a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,19 +1,187 @@ -# OS Files -.DS_Store -~* +# ==================================================================SYSTEM FILES -# Python Files -__pycache__ +# ===== WINDOWS +Thumbs.db +ehthumbs.db +Desktop.ini +$RECYCLE.BIN/ +*.cab +*.msi +*.msm +*.msp +*.lnk +# ===== OSX +*.DS_Store +.AppleDouble +.LSOverride +Icon +._* +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk +# ===== LINUX +.fuse_hidden* +.directory +.Trash-* + +# ============================================== SETTINGS, RULES, AND MISC FILES + +# ===== SUBLIME +*.sublime-project +*.sublime-workspace +sftp-config.json +*.tmlanguage.cache +*.tmPreferences.cache 
+*.stTheme.cache +Package Control.last-run +Package Control.ca-list +Package Control.ca-bundle +Package Control.system-ca-bundle +Package Control.cache/ +Package Control.ca-certs/ +bh_unicode_properties.cache +GitHub.sublime-settings +# ===== VIM +# swap +[._]*.s[a-w][a-z] +[._]s[a-w][a-z] +# session +Session.vim +# temporary +.netrwhist +*~ +# auto-generated tag files +tags +# Visual Studio +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +# ===== OTHERS +.idea/ +.project +.cproject +.nodemonignore + + +# ======================================================================= PYTHON +# PYTHON CORE +__pycache__/ *.pyc +*.py[cod] +*$py.class +*.pcd +qtcreator-* +*.user +*.sqlite3 +.so +pip-log.txt +pip-delete-this-directory.txt +pip-selfcheck.json +# ===== DJANGO +local_settings.py +# ===== SPHINX +docs/_build/ +# ===== PYINSTALLER +*.manifest +*.spec +# ===== PYBUILDER +target/ +# ===== JEKYLL +_site/ +.jekyll-metadata +# ===== SCRAPY +.scrapy +# ===== IPYTHON +.ipynb_checkpoints/ +.python-version +# ===== CELERY +celerybeat-schedule +# ===== VIRTUAL ENVIRONMENTS +.env +.venv/ +venv/ +ENV/ +.Python +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +.venv +# ===== TRANSLATION +*.mo +*.pot +public/i18n/*.js + +# =========================================================== TESTING & COVERAGE +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + + +# =========================================================== AUTOMATION & TASKS +# ===== GRUNT + +# ===== GULP + +# ===== LARAVEL/HOMESTEAD +Homestead.yaml +Homestead.json +/public/css/ +/public/js/ +/vendor -# Build Files -*.egg-info -MANIFEST -htmlcov -build -dist +# ===== SASS/LASS +.sass-cache/ +*.css.map +# ===== OTHER GENERATED DOCS +*.dox +*.wikidoc +# ================================================================== VIRTUAL BOX +# ===== VAGRANT +.vagrant/ +# ===== ANSIBLE +.secrets.yml -# Testing files -.pypirc 
-.testing* +# ================================================== DISTRIBUTION & NODEJS/BOWER +bin/ +bower_components/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +env/ +include/ +local/ +lib/ +lib64/ +man/ +node_modules/ +parts/ +sdist/ +tmp/ +.installed.cfg +.eggs/ +*.log +*.egg-info/ +*.egg diff --git a/.travis.yml b/.travis.yml index 1dda4b3..82ebe58 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ # test in python language: python -sudo: false # python versions tested python: - "3.5" @@ -8,21 +7,23 @@ python: - "3.6.2" # enable the use of sudo sudo: required -# operating system to use +# set os os: linux -# distribution to use +# set distribution dist: trusty -# setting environment variables +# set environment variables env: - PACKAGE_VERSION=0.0.1 -# command to install dependencies +# install dependencies install: - pip install -r requirements-dev.txt -# command to run tests +# launch test discovery and run tests script: - nose2 +# send results to coveralls.io after_success: coveralls +# never give email notifications notifications: on_success: never on_failure: never diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..53e4f0f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,25 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) +and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +* Additions... + * files module + * text module + * web module + +## [0.0.1] + +* Changes... + * Project renamed to dhelp + * Each module passing tests (see below) + * Project documentation added to README.md +* Additions... 
+ * Unit testing for files, text, and web modules added + * Test coverage added with computer-readable (.coverage) and human-friendly (an html folder, htmlcov) reports + * Continuous Integration support for Travis-CI and Coveralls + * .editorconfig added to enforce project standards + * To document changes, both past and future, added CHANGELOG.md and TODO.md diff --git a/README.md b/README.md index 9c4d4eb..1326300 100644 --- a/README.md +++ b/README.md @@ -25,99 +25,502 @@ pip install dhelp --- -# Examples +# Documentation and Examples + +Full project documentation hosted on readthedocs.io coming soon. + +**Table of Contents** + +* [dhelp.web](#dhelp.web) + * [WebPage()](#WebPage()) + * [.scrape()](#WebPage().scrape()) +* [dhelp.files](#dhelp.files) + * [TextFile()](#TextFile()) + * [.load()](#TextFile().load()) + * [.save()](#TextFile().save()) + * [TextFolder()](#TextFolder()) + * [.text_files()](#TextFolder().text_files()) + * [.modify()](#TextFolder().modify()) + * [CSVFile()](#CSVFile()) + * [.load()](#CSVFile().load()) + * [.save()](#CSVFile().save()) + * [.modify()](#CSVFile().modify()) + * [.column_to_txts()](#CSVFile().column_to_txts()) +* [dhelp.text](#dhelp.text) + * [BasicText](#BasicText) + * [.stringify()](#BasicText.stringify()) + * [.rm_lines()](#BasicText.rm_lines()) + * [.rm_nonchars()](#BasicText.rm_nonchars()) + * [.rm_edits()](#BasicText.rm_edits()) + * [.rm_spaces()](#BasicText.rm_spaces()) + * [.rm_stopwords()](#BasicText.rm_stopwords()) + * [.lemmatize()](#BasicText.lemmatize()) + * [.re_search()](#BasicText.re_search()) + * [.tokenize()](#BasicText.tokenize()) + * [.tag()](#BasicText.tag()) + * [.ngrams()](#BasicText.ngrams()) + * [.skipgrams()](#BasicText.skipgrams()) --- -## Text Files +## dhelp.web -Load and save plain text data to/from files with TextFile. +Module for scraping and processing web data. Primarily used for scraping web +pages. 
+ +Contains: + +* [WebPage()](#WebPage()) + +--- + +### WebPage() + +Provides methods to download/parse a specified webpage. Merges the request +package with [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) +functions to enable users to request/soup +a page in a single line. ```python -from dhelp import TextFile +from dhelp import WebPage -# load file as a string -file_data = TextFile('some/path.txt').load() -# remove all endlines -file_data = file_data.replace('\n') -# save altered text, specifying the overwrite option -TextFile('some/path.txt').save(file_data, options={'overwrite': True}) +web_page = WebPage('https://stackoverflow.com') + +# if printed to screen, WebPage will display the url to which it is connected +>>> print(web_page) +https://stackoverflow.com ``` +#### WebPage.soup() + +Invokes web request then returns a soup object loaded with page HTML + +```python + +# fetch webpage and parse into BeautifulSoup object +>>> parsed_webpage = WebPage('https://stackoverflow.com/').soup() + +# grab the logo from the header with BeautifulSoup +>>> header_logo_text = parsed_webpage.find('header') + .find('div', class_='-main') + .find('span', class_='-img') + +# print the text contained in the span tag +>>> print(header_logo_text.get_text()) +Stack Overflow +``` + --- -## Text Folders +## Module: dhelp.files + +Module for loading/saving/modifying individual or groups of files. Primarily +used for loading of plain text files (individually or by folder) or CSVs. + +--- -Modify entire folders of plain text data easily with TextFolder. You can use -.load() to get a list of TextFile objects, each one already linked to the -location of a file in the folder. Better yet, to streamline text processing, -you can use .modify(). Once you define a function showing how to alter the data -of a single file, you can then load, modify, and save every single file in -a folder in a single line of code. 
+### TextFile() + +Load and save plain text data to/from files with TextFile. Loads data located +at given path as a string. Likewise, .save() will save string data at the +system path sent to TextFile. ```python -from dhelp import TextFolder +>>> from dhelp import TextFile -# load .txts in folder as list of TextFile objs, each linked to file loc -folder_files = TextFolder('some/folder').load() -# loop through each TextFile object, load data and print -for folder_file in folder_files: - print(folder_file.load()) +>>> text_file = TextFile('some/path.txt') + +# if printed to screen, TextFile will display the system path to which it is connected +>>> print(text_file) +some/path.txt +``` + +#### TextFile.load() + +Opens file and returns contents as a single string. + +```python + +>>> from dhelp import TextFile + +# load data from path and print +>>> file_data = TextFile('some/path.txt').load() +>>> print(file_data) +Lorem ipsum dolor sit amet... + +``` + +#### TextFile.save() + +Saves string data to file, won't overwrite unless option is flagged. + +```python + +>>> from dhelp import TextFile + +# save string data to path then print path +>>> saved_text_file = TextFile('some/path.txt').save('Lorem ipsum dolor sit amet...') +>>> print(saved_text_file) +/absolute/path/to/some/path.txt + +# overwrite previous data by passing options dictionary with 'overwrite' set +>>> options = {'overwrite': True} +>>> saved_text_file = saved_text_file.save('consectetur adipiscing elit', options=options) +>>> print(saved_text_file) +/absolute/path/to/some/path.txt + + +``` + +--- + +### TextFolder() + +Can load or save a folder of plaintext files as a list of strings. Also enables +batch editing of an entire directory by passing a callback. 
+ +```python +>>> from dhelp import TextFolder + +>>> text_folder = TextFolder('some/path') + +# like TextFile, if printed to screen, TextFolder will display the system path to which it is connected +>>> print(text_folder) +some/path ``` -Setting options... +#### TextFolder.text_files() + +Load all .txt (or other types) in a folder as list of TextFile objects. ```python -# options dict, specifying all possible options, including output dir -options_settings = { - 'encoding': 'utf-8', - 'extensions': ['txt', 'html', 'rtf'] -} -# pass the dict as the options argument -TextFolder('some/folder').load(options=options_settings) + +# load .txts in folder as list of TextFile objs, each linked to file loc +>>> folder_files = TextFolder('some/path').text_files() +# loop through each TextFile object, load data and print +>>> for folder_file in folder_files: +>>> print(folder_file.load()) +Lorem ipsum dolor sit amet... + ``` -Modify all files, simplified +#### TextFolder.modify() + +Opens every file and performs a callback function sent to it. Provides a fast +means of batch editing an entire folder of txt files. Returns a new TextFolder +linked with the modified copy. ```python -# make a function defining how to modify the data of a single record -# function must have single arg (e.g. record_data), alter it, and return it -def modify_record(record_data): - # remove all endlines - record_data = record_data.replace('\n', '') - # return altered data - return record_data + +# make a function with a single arg which defines how to modify a single record +>>> def modify_record(record_data): +>>> record_data = record_data.replace('\n', '') +>>> return record_data # if you don't specify destination, a backup will automatically be created -options_settings = {'destination': 'some/other-folder'} +>>> options = {'destination': 'some/other-path'} # use TextFolder().modify, pass your function as 1st arg -# TextFolder will then load, modify, and save each file automatically! 
-TextFolder('some/folder').modify(modify_record, options=options_settings) +>>> text_folder = TextFolder('some/path').modify(modify_record, options=options) +>>> print(text_folder) +/absolute/path/to/some/path ``` --- -## CSV Files +### CSVFile() + +Makes loading and saving CSV data a simple matter. Simplifies the use of the +csv.DictReader and csv.DictWriter for loading or saving csv's as lists of +dictionaries. ```python +>>> from dhelp import CSVFile + +>>> csv_file = CSVFile('some/path.csv') + +# if printed to screen, CSVFile will display the system path to which it is connected +>>> print(csv_file) +/absolute/path/to/some/path.csv + ``` ---- +#### CSVFile().fieldnames -## Web Pages +Opens CSV file and reads the first row to get column names. ```python +>>> csv_file = CSVFile('some/path.csv') +>>> print(csv_file.fieldnames) +['id', 'text', 'notes'] + +``` + +#### CSVFile().load() + +Load csv data as list of dictionaries. + +```python + +>>> csv_file = CSVFile('some/path.csv') +>>> csv_data = csv_file.load() +>>> print(csv_data) +[{'id': '1', 'text': 'Lorem ipsum', 'notes': ''}, {'id': '2', 'text': 'dolor sit', 'notes': ''}, {'id': '3', 'text': 'amet.', 'notes': ''}] + +``` + +#### CSVFile().save() + +Save a list of dictionaries to a .csv file. You must specify +the column headers (fieldnames) with a list of strings. Returns True +upon success. + +```python +>>> fake_fieldnames = ['id', 'text', 'notes'] +>>> fake_data = [{ + 'id': '1', + 'text': 'Lorem ipsum', + 'notes': '' + }, { + 'id': '2', + 'text': 'dolor sit', + 'notes': '' + }, { + 'id': '3', + 'text': 'amet.', + 'notes': '' + }] +>>> csv_file = CSVFile('some/path.csv').save(fake_data, fieldnames=fake_fieldnames) +>>> print(csv_file) +/absolute/path/to/some/path.csv + + +``` + +#### CSVFile().modify() + +Copies CSV to destination then performs the modify_cb callback +function passed on each data row before saving the file. Quick way +to perform batch changes to a CSV.
Returns new CSVFile object linked +to modified CSV. + +```python + +# define a function which describes how to modify any given data row +>>> def modify_function(csv_record): +>>> csv_record['text'] = 'Lorem ipsum dolor sit amet...' +>>> csv_record['notes'] = 'Edited with dhelp' +>>> return csv_record + +# pass a destination and your function as arguments to .modify() +>>> csv_file = CSVFile('some/path.csv') +>>> altered_csv_file = csv_file.modify('some/other-path.csv', modify_cb=modify_function) + +# .modify will return a new CSVFile object tied to the new location +>>> print(altered_csv_file) +/absolute/path/to/some/other-path.csv + ``` --- -## Natural Language Processing +## dhelp.text + +Module for text processing and natural language processing, primarily using +the natural language toolkit (nltk). + +--- + +### BasicText() + +Base class for all Text objects. Can be used on its own to perform a number +of operations, although it is best used with one of its language-specific +children. ```python +>>> from dhelp import BasicText + +>>> basic_text = BasicText('Lorem ipsum dolor sit amet...') +>>> print(basic_text) +Lorem ipsum dolor sit amet... + +``` + +#### BasicText().stringify() + +Returns the text of this object as a pure string type. + +```python + +>>> basic_text = BasicText('Lorem ipsum dolor sit amet...') +>>> stringified_text = basic_text.stringify() +>>> print(type(stringified_text)) +<class 'str'> + + +``` + +#### BasicText().rm_lines() + +Gives a new version of the text with all endlines removed. Removes +any dashed line endings and rejoins split words. + +```python + +>>> basic_text = BasicText('Lorem\nipsum do-\nlor sit amet....\n') +>>> modified_text = basic_text.rm_lines() +>>> print(modified_text) +Lorem ipsum dolor sit amet... + +``` + +#### BasicText().rm_nonchars() + +Gives a new version of the text with only latin characters remaining. +Is overridden by child objects for languages using non latinate chars. 
+ +```python + +>>> basic_text = BasicText('1αLorem ipsum 2βdolor sit 3γamet...') +>>> modified_text = basic_text.rm_nonchars() +>>> print(modified_text) +Lorem ipsum dolor sit amet... + +``` + +#### BasicText().rm_edits() + +Gives a new version with any text between editorial marks such as +brackets or parentheses removed. NOTE: May not work exactly as shown below; +the way this method deals with spaces inside or outside editorial marks needs +improvement. + +```python + +>>> basic_text = BasicText('Lore[m i]psum {dolo}r sit a(met)...') +>>> modified_text = basic_text.rm_edits() +>>> print(modified_text) +Lor psum r sit a... + +``` + +#### BasicText().rm_spaces() + +Gives a new version of the text with extra whitespace collapsed. + +```python + +>>> basic_text = BasicText('Lorem ipsum dolor sit amet...') +>>> modified_text = basic_text.rm_spaces() +>>> print(modified_text) +Lorem ipsum dolor sit amet... + +``` + +#### BasicText().rm_stopwords() + +Given a list of words or phrases, gives new text with those phrases +removed. + +```python + +>>> stopwords = ['ipsum', 'sit'] +>>> basic_text = BasicText('Lorem ipsum dolor sit amet...') +>>> modified_text = basic_text.rm_stopwords(stoplist=stopwords) +>>> print(modified_text) +Lorem dolor amet... + + +``` + +#### BasicText().lemmatize() + +Gives a new version of the text in which every word is lemmatized. All verbs +are transformed into the first person singular present active, all nouns are +transformed into the singular masculine nominative, etc. NOTE: May not work +exactly as shown below, as this will be tweaked. + +```python + +>>> basic_text = BasicText('They hated to think of sample sentences.') +>>> modified_text = basic_text.lemmatize() +>>> print(modified_text) +I hate think of sample sentence. + + +``` + +#### BasicText().re_search() + +Receives a [Regular Expression](https://regexr.com/) search pattern and returns +True/False if it matches. 
+ +```python + +>>> basic_text = BasicText('Lorem ipsum dolor sit amet...') +>>> print(basic_text.re_search('Lorem ipsum')) +True +>>> print(basic_text.re_search('Arma virumque cano')) +False + +``` + +#### BasicText().tokenize() + +Returns a tokenized list. By default returns list of words, but can also return +as a list of sentences. + +```python + +>>> basic_text = BasicText('Lorem ipsum dolor sit amet. Consectetur adipiscing elit.') +>>> print(basic_text.tokenize()) +['Lorem', 'ipsum', 'dolor', 'sit', 'amet', '.', 'Consectetur', 'adipiscing', 'elit', '.'] +>>> print(basic_text.tokenize(mode='sentence')) +['Lorem ipsum dolor sit amet.', 'Consectetur adipiscing elit.'] + +``` + +#### BasicText().tag() + +Returns list of words marked up with parts of speech. Each word is returned as +a 2-tuple: the first item is the word, the second is its part-of-speech tag. + +```python + +>>> basic_text = BasicText('They hated to think of sample sentences.') +>>> basic_tags = basic_text.tag() +>>> print(basic_tags) +[('They', 'PRP'), ('hated', 'VBD'), ('to', 'TO'), ('think', 'VB'), ('of', 'IN'), ('sample', 'JJ'), ('sentences', 'NNS'), ('.', '.')] + +``` + +#### BasicText().ngrams() + +Returns a list of ngrams, each ngram represented as a tuple. + +```python + +>>> basic_text = BasicText('They hated to think of sample sentences.') +>>> basic_ngrams = basic_text.ngrams() +>>> print(basic_ngrams) +[('They', 'hated', 'to'), ('hated', 'to', 'think'), ('to', 'think', 'of'), ('think', 'of', 'sample'), ('of', 'sample', 'sentences'), ('sample', 'sentences', '.')] + +``` + +#### BasicText().skipgrams() + +Returns list of skipgrams, similar to ngram, but allows spacing between tokens. 
+ +```python + +>>> basic_text = BasicText('They hated to think of sample sentences.') +>>> basic_skipgrams = basic_text.skipgrams() +>>> print(basic_skipgrams) +[('They', 'hated', 'to'), ('They', 'hated', 'think'), ('They', 'to', 'think'), ('hated', 'to', 'think'), ('hated', 'to', 'of'), ('hated', 'think', 'of'), ('to', 'think', 'of'), ('to', 'think', 'sample'), ('to', 'of', 'sample'), ('think', 'of', 'sample'), ('think', 'of', 'sentences'), ('think', 'sample', 'sentences'), ('of', 'sample', 'sentences'), ('of', 'sample', '.'), ('of', 'sentences', '.'), ('sample', 'sentences', '.')] + ``` diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..30932ee --- /dev/null +++ b/TODO.md @@ -0,0 +1,21 @@ +# todo + +# General or Current TODO Items + +* Revise and add to README.md, provide function-level documentation + +# TODO Items by Module + +* dhelp.files + * Add support for .pdf + * Add support for .tsv + * Add support for .xls, .xlsx +* dhelp.web + * Add support for PDFs + * Add support for other non-plaintext data, tbd +* dhelp.text + * Add english language-specific nltk text class + * Add cltk generalized text (or mix-in) class + * Add greek and latin language-specific text classes + * Add helper method to automate the downloading of nltk trainer sets + * Add helper method to automate the downloading of cltk trainer sets diff --git a/dhelp/__init__.py b/dhelp/__init__.py index 4f8ea9a..2c4bf5c 100644 --- a/dhelp/__init__.py +++ b/dhelp/__init__.py @@ -12,3 +12,4 @@ from .files import CSVFile, TextFile, TextFolder from .web import WebPage +from .text import BasicText diff --git a/dhelp/files/csv_file.py b/dhelp/files/csv_file.py index 0bd93dc..8e21d86 100644 --- a/dhelp/files/csv_file.py +++ b/dhelp/files/csv_file.py @@ -43,7 +43,7 @@ class CSVFile(Path): @property def fieldnames(self): """ - Opens CSV file and reads the first row to get column names + Opens CSV file and reads the first row to get column names. 
""" column_headers = [] with open(self.data, 'r+') as csv_file: @@ -82,7 +82,7 @@ def save(self, data, fieldnames, options={}): """ Save a list of dictionaries to a .csv file. You must specify the column headers (fieldnames) with a list of strings. Returns True - upon success + upon success. Example: fake_data = [ @@ -110,7 +110,7 @@ def modify(self, destination, modify_cb, options={}): Copies CSV to destination then performs the modify_cb callback function passed on each data row before saving the file. Quick way to perform batch changes to a CSV. Returns new CSVFile object linked - to modified CSV + to modified CSV. """ # create csv object tied to destination and empty deque for new data new_csv_file = self.__class__(destination) diff --git a/dhelp/files/text_file.py b/dhelp/files/text_file.py index d622b30..ad2d6ad 100644 --- a/dhelp/files/text_file.py +++ b/dhelp/files/text_file.py @@ -26,7 +26,7 @@ class TextFile(Path): def load(self, options={}): """ - Opens file and returns contents as a single string + Opens file and returns contents as a single string. """ # set option defaults if 'encoding' not in options: @@ -41,7 +41,7 @@ def load(self, options={}): def save(self, data, options={}): """ - Saves string data to file, won't overwrite unless option is flagged + Saves string data to file, won't overwrite unless option is flagged. """ # set option defaults if 'encoding' not in options: diff --git a/dhelp/files/text_folder.py b/dhelp/files/text_folder.py index 02dd17f..34e57c5 100644 --- a/dhelp/files/text_folder.py +++ b/dhelp/files/text_folder.py @@ -6,7 +6,6 @@ Object for interacting with a folder of plain text files. Allows quick discovery of filepaths and construction of relevant TextFile objects. -Also enables batch editing of an entire directory by passing a callback. """ @@ -19,7 +18,8 @@ class TextFolder(Folder): """ - Can load or save a folder of plaintext files as a list of strings. 
+ Can load or save a folder of plaintext files as a list of strings. Also + enables batch editing of an entire directory by passing a callback. """ def text_files(self, options={}): diff --git a/dhelp/text/__init__.py b/dhelp/text/__init__.py index ca5866f..2c3ce3a 100644 --- a/dhelp/text/__init__.py +++ b/dhelp/text/__init__.py @@ -8,3 +8,5 @@ the natural language toolkit (nltk). """ + +from .basic_text import BasicText diff --git a/dhelp/text/text.py b/dhelp/text/basic_text.py similarity index 99% rename from dhelp/text/text.py rename to dhelp/text/basic_text.py index f1d69e3..330414c 100644 --- a/dhelp/text/text.py +++ b/dhelp/text/basic_text.py @@ -18,7 +18,7 @@ from nltk import pos_tag -class Text(UserString): +class BasicText(UserString): """ Base class for all Text objects. Can be used on its own to perform a number of operations, although it is best used with on of its language-specific @@ -117,6 +117,30 @@ def rm_stopwords(self, stoplist=[]): self.options ) + def lemmatize(self): + """ + Gives a new version of the text in which every word is lemmatized. All + verbs are transformed into the first person singular present active, + all nouns are transformed into the singular masculine nominative, et.c. + """ + tagged_words = self.tag() + lemmata = [] + lemmatizer = WordNetLemmatizer() + for word, parsing in tagged_words: + # Grab main part of speech from first character in POS + pos = parsing[0] + try: + lemmatized_word = lemmatizer.lemmatize( + word.lower(), pos=pos.lower()[0] + ) + except: + lemmatized_word = word + lemmata.append(lemmatized_word) + return self.__class__( + " ".join(lemmata), + self.options + ) + def re_search(self, pattern): """ Receives a RegEx search pattern and returns True/False if it matches. @@ -151,30 +175,6 @@ def tag(self): word_list = list(self.tokenize()) return pos_tag(word_list) - def lemmatize(self): - """ - Gives a new version of the text in which every word is lemmatized. 
All - verbs are transformed into the first person singular present active, - all nouns are transformed into the singular masculine nominative, et.c. - """ - tagged_words = self.tag() - lemmata = [] - lemmatizer = WordNetLemmatizer() - for word, parsing in tagged_words: - # Grab main part of speech from first character in POS - pos = parsing[0] - try: - lemmatized_word = lemmatizer.lemmatize( - word.lower(), pos=pos.lower()[0] - ) - except: - lemmatized_word = word - lemmata.append(lemmatized_word) - return self.__class__( - " ".join(lemmata), - self.options - ) - def ngrams(self, gram_size=3): """ Returns a list of ngrams, each ngram represented as a tuple diff --git a/dhelp/text/tests/test_text.py b/dhelp/text/tests/test_basic_text.py similarity index 62% rename from dhelp/text/tests/test_text.py rename to dhelp/text/tests/test_basic_text.py index c801d85..c45c3a6 100644 --- a/dhelp/text/tests/test_text.py +++ b/dhelp/text/tests/test_basic_text.py @@ -2,59 +2,59 @@ import unittest -from ..text import Text +from ..basic_text import BasicText -class TestText(unittest.TestCase): +class TestBasicText(unittest.TestCase): def test_stringify(self): # Should get type of string - exempla = Text("Lorem ipsum dolor sit amet") + exempla = BasicText("Lorem ipsum dolor sit amet") exempla = type(exempla.stringify()) comparanda = str return self.assertEqual(exempla, comparanda) def test_rm_lines(self): # should get version with endline replaced with space - exempla = Text("Lorem ipsum dolor\nsit amet") - comparanda = Text("Lorem ipsum dolor sit amet") + exempla = BasicText("Lorem ipsum dolor\nsit amet") + comparanda = BasicText("Lorem ipsum dolor sit amet") exempla = exempla.rm_lines() return self.assertEqual(exempla, comparanda) def test_rm_nonchars(self): # numbers should be removed - exempla = Text("Lorem1 ipsum2 dolor3 sit4 amet5") - comparanda = Text("Lorem ipsum dolor sit amet") + exempla = BasicText("Lorem1 ipsum2 dolor3 sit4 amet5") + comparanda = BasicText("Lorem ipsum 
dolor sit amet") exempla = exempla.rm_nonchars() return self.assertEqual(exempla, comparanda) def test_rm_edits(self): # text between brackets should be removed - exempla = Text("Lorem ipsum [dolor] sit amet") - comparanda = Text("Lorem ipsum sit amet") + exempla = BasicText("Lorem ipsum [dolor] sit amet") + comparanda = BasicText("Lorem ipsum sit amet") exempla = exempla.rm_edits() return self.assertEqual(exempla, comparanda) def test_rm_spaces(self): # redundant spaces should be gone - exempla = Text("Lorem ipsum dolor sit amet") - comparanda = Text("Lorem ipsum dolor sit amet") + exempla = BasicText("Lorem ipsum dolor sit amet") + comparanda = BasicText("Lorem ipsum dolor sit amet") exempla = exempla.rm_spaces() return self.assertEqual(exempla, comparanda) def test_rm_stopwords(self): # word in stopword list should be removed - exempla = Text("Lorem ipsum dolor sit amet") - comparanda = Text("Lorem ipsum sit amet") + exempla = BasicText("Lorem ipsum dolor sit amet") + comparanda = BasicText("Lorem ipsum sit amet") exempla = exempla.rm_stopwords(['dolor']) return self.assertEqual(exempla, comparanda) def test_re_search_present(self): # should be true as pattern is present - exempla = Text("Lorem ipsum dolor sit amet") + exempla = BasicText("Lorem ipsum dolor sit amet") return self.assertTrue(exempla.re_search('ipsum')) def test_re_search_not_present(self): # should be false as pattern is not present - exempla = Text("Lorem ipsum dolor sit amet") + exempla = BasicText("Lorem ipsum dolor sit amet") return self.assertFalse(exempla.re_search('Arma virumque cano')) diff --git a/requirements-cltk.txt b/requirements-cltk.txt new file mode 100644 index 0000000..589afeb --- /dev/null +++ b/requirements-cltk.txt @@ -0,0 +1,10 @@ +-r requirements.txt +cltk==0.1.83 +gitdb2==2.0.3 +GitPython==2.1.8 +python-crfsuite==0.9.5 +pyuca==1.2 +PyYAML==3.12 +regex==2018.2.21 +smmap2==2.0.3 +Whoosh==2.7.4