From 55b8b08172e0d6b4c72285d96f3969149662535b Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Wed, 15 Nov 2017 07:12:06 -0500 Subject: [PATCH 01/44] Rename LICENSE to LICENSE.txt --- LICENSE => LICENSE.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename LICENSE => LICENSE.txt (100%) diff --git a/LICENSE b/LICENSE.txt similarity index 100% rename from LICENSE rename to LICENSE.txt From feacc909974df2c23c2ace76de723f07f54658cc Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sun, 2 Sep 2018 21:13:24 -0400 Subject: [PATCH 02/44] ADMIN: Setup test framework, copy in prior work Patterns from prior work copied in as comments. Need to be actually worked into functioning objects and tests. --- pent/patterns.py | 84 ++++++++++++++++ pent/test/__init__.py | 24 +++++ pent/test/pent_base.py | 214 +++++++++++++++++++++++++++++++++++++++++ requirements-dev.txt | 2 + tests.py | 109 +++++++++++++++++++++ 5 files changed, 433 insertions(+) create mode 100644 pent/patterns.py create mode 100644 pent/test/__init__.py create mode 100644 pent/test/pent_base.py create mode 100644 tests.py diff --git a/pent/patterns.py b/pent/patterns.py new file mode 100644 index 0000000..0ac68b9 --- /dev/null +++ b/pent/patterns.py @@ -0,0 +1,84 @@ +r"""``pyparsing`` *patterns for* ``pent``. + +``pent`` Extracts Numerical Text. + +**Author** + Brian Skinn (bskinn@alum.mit.edu) + +**File Created** + 2 Sep 2018 + +**Copyright** + \(c) Brian Skinn 2018 + +**Source Repository** + http://www.github.com/bskinn/pent + +**Documentation** + http://pent.readthedocs.io + +**License** + The MIT License; see |license_txt|_ for full license terms + +**Members** + +""" + +import pyparsing as pp + + +class Numbers: + """Patterns matching single numbers.""" + + pass + +# pyparsing patterns from initial work. Definitely remove the .WordStart +# and .WordEnd tokens from these core definitions. + +#~ ppps = {} +#~ num_punct = '+-.' 
+#~ ppps.update({Values.POSINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + + #~ pp.Optional('+') + + #~ pp.Word(pp.nums) + + #~ pp.WordEnd(pp.alphanums + num_punct))}) +#~ ppps.update({Values.NEGINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + + #~ pp.Literal('-') + + #~ pp.Word(pp.nums) + + #~ pp.WordEnd(pp.alphanums + num_punct))}) +#~ ppps.update({Values.ANYINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + + #~ pp.Optional(pp.Literal('+') ^ pp.Literal('-')) + + #~ pp.Word(pp.nums) + + #~ pp.WordEnd(pp.alphanums + num_punct))}) + +# Regex patterns from initial work: + +#~ # Integers (code i) +#~ strs.update({Values.POSINT: '[+]?\\d+'}) +#~ strs.update({Values.NEGINT: '-\\d+'}) +#~ strs.update({Values.ANYINT: '[-+]?\\d+'}) + +#~ # Floats (code f) +#~ strs.update({Values.POSFLOAT: '[+]?(\\d+\\.\\d*|\\d*\\.\\d+)'}) +#~ strs.update({Values.NEGFLOAT: '-(\\d+\\.\\d*|\\d*\\.\\d+)'}) +#~ strs.update({Values.ANYFLOAT: '[-+]?(\\d+\\.\\d*|\\d*\\.\\d+)'}) + +#~ # Scinot (code s; accepts both d and e for the exponent marker) +#~ strs.update({Values.POSSCI: '[+]?(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) +#~ strs.update({Values.NEGSCI: '-(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) +#~ strs.update({Values.ANYSCI: '[-+]?(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) + +#~ # Float or scinot (code d, for ... decimal?) 
+#~ strs.update({Values.POSDEC: '[+]?(\\d+\\.\\d*|\\d*\\.\\d+|\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) +#~ strs.update({Values.NEGDEC: '-(\\d+\\.\\d*|\\d*\\.\\d+|\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[dDEe][-+]?\\d+)'}) +#~ strs.update({Values.ANYDEC: '[-+]?(\\d+\\.\\d*|\\d*\\.\\d+|\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) + +#~ # Any numerical value (code n) +#~ # This one is simpler than decimal because the first pattern option of the two matches integers (everything +#~ # after the initial `\\d+` is optional) +#~ strs.update({Values.POSNUM: '[+]?(\\d+\\.?\\d*([deDE][-+]?\\d+)?|\\d*\\.\\d+([deDE][-+]?\\d+)?)'}) +#~ strs.update({Values.NEGNUM: '-(\\d+\\.?\\d*([deDE][-+]?\\d+)?|\\d*\\.\\d+([deDE][-+]?\\d+)?)'}) +#~ strs.update({Values.ANYNUM: '[-+]?(\\d+\\.?\\d*([deDE][-+]?\\d+)?|\\d*\\.\\d+([deDE][-+]?\\d+)?)'}) + + +if __name__ == '__main__': + print("Module not executable.") diff --git a/pent/test/__init__.py b/pent/test/__init__.py new file mode 100644 index 0000000..12d8ea1 --- /dev/null +++ b/pent/test/__init__.py @@ -0,0 +1,24 @@ +# ------------------------------------------------------------------------------ +# Name: __init__ +# Purpose: Package submodule definition for the test suite +# +# Author: Brian Skinn +# bskinn@alum.mit.edu +# +# Created: 2 Sep 2018 +# Copyright: (c) Brian Skinn 2018 +# License: The MIT License; see "LICENSE.txt" for full license terms. 
+# +# https://www.github.com/bskinn/pent +# +# ------------------------------------------------------------------------------ + +"""Base submodule for the pent test suite.""" + +from __future__ import absolute_import + +__all__ = ['suite_expect_good', + ] + +from .pent_base import suite_expect_good + diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py new file mode 100644 index 0000000..150e56f --- /dev/null +++ b/pent/test/pent_base.py @@ -0,0 +1,214 @@ +# ------------------------------------------------------------------------------ +# Name: pent_base +# Purpose: Module defining common objects for pent tests +# +# Author: Brian Skinn +# bskinn@alum.mit.edu +# +# Created: 2 Sep 2018 +# Copyright: (c) Brian Skinn 2018 +# License: The MIT License; see "LICENSE.txt" for full license terms. +# +# https://www.github.com/bskinn/pent +# +# ------------------------------------------------------------------------------ + +"""Module defining common objects for pent tests.""" + + +import os +import os.path as osp +import unittest as ut + + +class TestPentCorePatterns(ut.TestCase): + """Confirming basic pattern matching of the core pyparsing patterns.""" + + def test_dummy_test(self): + self.assertTrue(True) + +# Test content from Jupyter testing + +#~ # Set of all test values +#~ vals = {'0': { + #~ Values.POSINT: True, Values.NEGINT: False, Values.ANYINT: True, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '-0': { + #~ Values.POSINT: False, Values.NEGINT: True, Values.ANYINT: True, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, + #~ Values.POSNUM: 
False, Values.NEGNUM: True, Values.ANYNUM: True + #~ }, + #~ '+0.': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '-.00': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: True, Values.ANYFLOAT: True, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, + #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True + #~ }, + #~ '+35': { + #~ Values.POSINT: True, Values.NEGINT: False, Values.ANYINT: True, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '23': { + #~ Values.POSINT: True, Values.NEGINT: False, Values.ANYINT: True, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '-12': { + #~ Values.POSINT: False, Values.NEGINT: True, Values.ANYINT: True, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, + #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True + #~ }, + #~ '.12': { + #~ Values.POSINT: False, Values.NEGINT: 
False, Values.ANYINT: False, + #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '35.': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '+218.': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '+.355': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '0.23': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '-.22': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: True, Values.ANYFLOAT: True, + #~ 
Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, + #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True + #~ }, + #~ '-234.': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: True, Values.ANYFLOAT: True, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, + #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True + #~ }, + #~ '-392.34': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: True, Values.ANYFLOAT: True, + #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, + #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, + #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True + #~ }, + #~ '+3e3': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '+3e+3': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '+3e+003': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, + #~ Values.POSDEC: True, Values.NEGDEC: 
False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '3e+003': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '+3.e5': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '+2e-04': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '+.34e23': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, + #~ '+.48e-2': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, + #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, + #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True + #~ }, 
+ #~ '-2e-04': { + #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, + #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, + #~ Values.POSSCI: False, Values.NEGSCI: True, Values.ANYSCI: True, + #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, + #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True + #~ } + #~ # INVALID VALUES... '+-0.349', complex(?), etc. + #~ } + + +def suite_expect_good(): + """Create and return the test suite for expect-good tests.""" + s = ut.TestSuite() + tl = ut.TestLoader() + s.addTests([tl.loadTestsFromTestCase(TestPentCorePatterns)]) + return s + + +if __name__ == '__main__': + print("Module not executable.") diff --git a/requirements-dev.txt b/requirements-dev.txt index 8697953..a7cde5a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,3 +7,5 @@ flake8-docstrings coverage tox wget +pyparsing + diff --git a/tests.py b/tests.py new file mode 100644 index 0000000..62982a6 --- /dev/null +++ b/tests.py @@ -0,0 +1,109 @@ +# ------------------------------------------------------------------------------ +# Name: tests +# Purpose: Master script for pent testing suite +# +# Author: Brian Skinn +# bskinn@alum.mit.edu +# +# Created: 2 Sep 2018 +# Copyright: (c) Brian Skinn 2018 +# License: The MIT License; see "LICENSE.txt" for full license terms. +# +# http://www.github.com/bskinn/pent +# +# ------------------------------------------------------------------------------ + + +class AP(object): + """ Container for arguments for selecting test suites. + + Also includes PFX, a helper string for substitution/formatting. 
+ + """ + ALL = 'all' + + PFX = "--{0}" + + +def get_parser(): + import argparse + + # Create the parser + prs = argparse.ArgumentParser(description="Run tests for pent") + + # Verbosity argument + prs.add_argument('-v', action='store_true', + help="Show verbose output") + + # Test subgroups + + # Options without subgroups + prs.add_argument(AP.PFX.format(AP.ALL), '-a', + action='store_true', + help="Run all tests (overrides any other selections)") + + # Return the parser + return prs + + +def main(): + import sys + import unittest as ut + + import pent.test + + # Retrieve the parser + prs = get_parser() + + # Pull the dict of stored flags, saving the un-consumed args, and + # update sys.argv + ns, args_left = prs.parse_known_args() + params = vars(ns) + sys.argv = sys.argv[:1] + args_left + + # Create the empty test suite + ts = ut.TestSuite() + + # Helper function for adding test suites. Just uses ts and params from + # the main() function scope + def addsuiteif(suite, flags): + if any(params[k] for k in flags): + ts.addTest(suite) + + # Add commandline-indicated tests per-group + # Expect-good tests + addsuiteif(pent.test.pent_base.suite_expect_good(), + [AP.ALL]) + +# addsuiteif(sphobjinv.test.sphobjinv_api.suite_api_expect_good(), +# [AP.ALL, AP.LOCAL, AP.GOOD, AP.GOOD_LOCAL, +# AP.API, AP.API_LOCAL, AP.API_GOOD, AP.API_GOOD_LOCAL]) +# addsuiteif(sphobjinv.test.sphobjinv_api.suite_api_expect_good_nonlocal(), +# [AP.ALL, AP.GOOD, AP.API, AP.API_GOOD]) +# addsuiteif(sphobjinv.test.sphobjinv_cli.suite_cli_expect_good(), +# [AP.ALL, AP.LOCAL, AP.GOOD, AP.GOOD_LOCAL, +# AP.CLI, AP.CLI_LOCAL, AP.CLI_GOOD, AP.CLI_GOOD_LOCAL]) +# addsuiteif(sphobjinv.test.sphobjinv_cli.suite_cli_expect_good_nonlocal(), +# [AP.ALL, AP.GOOD, AP.CLI, AP.CLI_GOOD]) + +# # Expect-fail tests +# addsuiteif(sphobjinv.test.sphobjinv_api.suite_api_expect_fail(), +# [AP.ALL, AP.LOCAL, AP.FAIL, AP.API, AP.API_LOCAL, AP.API_FAIL]) +# addsuiteif(sphobjinv.test.sphobjinv_cli.suite_cli_expect_fail(), 
+# [AP.ALL, AP.LOCAL, AP.FAIL, AP.FAIL_LOCAL, +# AP.CLI, AP.CLI_LOCAL, AP.CLI_FAIL, AP.CLI_FAIL_LOCAL]) +# addsuiteif(sphobjinv.test.sphobjinv_cli.suite_cli_expect_fail_nonlocal(), +# [AP.ALL, AP.FAIL, AP.CLI, AP.CLI_FAIL]) + + # Create the test runner and execute + ttr = ut.TextTestRunner(buffer=True, + verbosity=(2 if params['v'] else 1), + ) + success = ttr.run(ts).wasSuccessful() + + # Return based on success result (lets tox report success/fail) + sys.exit(0 if success else 1) + + +if __name__ == '__main__': + main() From 9d76bcb391509ba0a0b784f208d6886d4f5a9cbe Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 3 Sep 2018 16:19:55 -0400 Subject: [PATCH 03/44] TEST: Rework number/sign test dataset Ready for implementation into tests, prior to actually constructing the various pyparsing patterns for them. --- pent/__init__.py | 52 ++--- pent/enums.py | 58 ++++++ pent/test/pent_base.py | 229 ++++----------------- pent/test/testdata.py | 443 +++++++++++++++++++++++++++++++++++++++++ tests.py | 61 +++--- 5 files changed, 594 insertions(+), 249 deletions(-) create mode 100644 pent/enums.py create mode 100644 pent/test/testdata.py diff --git a/pent/__init__.py b/pent/__init__.py index a1946a1..458e65f 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -1,28 +1,34 @@ -# ---------------------------------------------------------------------------- -# Name: __init__ -# Purpose: Package definition module for pent -# -# Author: Brian Skinn -# bskinn@alum.mit.edu -# -# Created: 14 Nov 2017 -# Copyright: (c) Brian Skinn 2017 -# License: The MIT License; see "LICENSE.txt" for full license terms -# and contributor agreement. -# -# This file is part of 'pent', a package for parsing of structured -# numerical text. -# -# http://www.github.com/bskinn/pent -# -# ---------------------------------------------------------------------------- - - -"""Definition file for root of pent.""" +r"""*Core package definition module for* ``pent``. + +``pent`` Extracts Numerical Text. 
+ +**Author** + Brian Skinn (bskinn@alum.mit.edu) + +**File Created** + 3 Sep 2018 + +**Copyright** + \(c) Brian Skinn 2018 + +**Source Repository** + http://www.github.com/bskinn/pent + +**Documentation** + http://pent.readthedocs.io + +**License** + The MIT License; see |license_txt|_ for full license terms + +**Members** + +""" from __future__ import absolute_import -__all__ = [] +__all__ = ['Number', 'Sign'] + +from .enums import Number, Sign -__version__ = '0.0' +__version__ = '0.1dev1' diff --git a/pent/enums.py b/pent/enums.py new file mode 100644 index 0000000..0fa5070 --- /dev/null +++ b/pent/enums.py @@ -0,0 +1,58 @@ +r"""``Enums`` *for* ``pent``. + +``pent`` Extracts Numerical Text. + +**Author** + Brian Skinn (bskinn@alum.mit.edu) + +**File Created** + 3 Sep 2018 + +**Copyright** + \(c) Brian Skinn 2018 + +**Source Repository** + http://www.github.com/bskinn/pent + +**Documentation** + http://pent.readthedocs.io + +**License** + The MIT License; see |license_txt|_ for full license terms + +**Members** + +""" + +from enum import Enum + +class Number(Enum): + """Enumeration for the different kinds of recognized number primitives.""" + + #: Integer value; no decimal or scientific/exponential notation + Integer = 'int' + + #: Floating-point value; no scientific/exponential notation + Float = 'float' + + #: Scientific/exponential notation, where exponent is *required* + SciNot = 'sci' + + #: "Decimal" value; floating-point value with or without an exponent + Decimal = 'dec' + + #: "General" value; integer, float, or scientific notation + General = 'gen' + + +class Sign(Enum): + """Enumeration for the different kinds of recognized numerical signs.""" + + #: Positive value only (leading '+' optional; includes zero) + Positive = 'pos' + + #: Negative value only (leading '-' required; includes negative zero) + Negative = 'neg' + + #: Any sign + Any = 'any' diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 150e56f..5167742 100644 --- 
a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -1,21 +1,33 @@ -# ------------------------------------------------------------------------------ -# Name: pent_base -# Purpose: Module defining common objects for pent tests -# -# Author: Brian Skinn -# bskinn@alum.mit.edu -# -# Created: 2 Sep 2018 -# Copyright: (c) Brian Skinn 2018 -# License: The MIT License; see "LICENSE.txt" for full license terms. -# -# https://www.github.com/bskinn/pent -# -# ------------------------------------------------------------------------------ - -"""Module defining common objects for pent tests.""" +r"""*Test objects for* ``pent`` *test suite*. +``pent`` Extracts Numerical Text. +**Author** + Brian Skinn (bskinn@alum.mit.edu) + +**File Created** + 3 Sep 2018 + +**Copyright** + \(c) Brian Skinn 2018 + +**Source Repository** + http://www.github.com/bskinn/pent + +**Documentation** + http://pent.readthedocs.io + +**License** + The MIT License; see |license_txt|_ for full license terms + +**Members** + +*(none documented)* + +""" + + +import itertools as itt import os import os.path as osp import unittest as ut @@ -24,182 +36,15 @@ class TestPentCorePatterns(ut.TestCase): """Confirming basic pattern matching of the core pyparsing patterns.""" - def test_dummy_test(self): - self.assertTrue(True) - -# Test content from Jupyter testing - -#~ # Set of all test values -#~ vals = {'0': { - #~ Values.POSINT: True, Values.NEGINT: False, Values.ANYINT: True, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '-0': { - #~ Values.POSINT: False, Values.NEGINT: True, Values.ANYINT: True, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ 
Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, - #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True - #~ }, - #~ '+0.': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '-.00': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: True, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, - #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True - #~ }, - #~ '+35': { - #~ Values.POSINT: True, Values.NEGINT: False, Values.ANYINT: True, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '23': { - #~ Values.POSINT: True, Values.NEGINT: False, Values.ANYINT: True, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '-12': { - #~ Values.POSINT: False, Values.NEGINT: True, Values.ANYINT: True, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: False, Values.NEGDEC: False, Values.ANYDEC: False, - #~ Values.POSNUM: False, Values.NEGNUM: True, 
Values.ANYNUM: True - #~ }, - #~ '.12': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '35.': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '+218.': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '+.355': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '0.23': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: True, Values.NEGFLOAT: False, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '-.22': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, 
- #~ Values.POSFLOAT: False, Values.NEGFLOAT: True, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, - #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True - #~ }, - #~ '-234.': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: True, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, - #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True - #~ }, - #~ '-392.34': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: True, Values.ANYFLOAT: True, - #~ Values.POSSCI: False, Values.NEGSCI: False, Values.ANYSCI: False, - #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, - #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True - #~ }, - #~ '+3e3': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '+3e+3': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '+3e+003': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: True, 
Values.NEGSCI: False, Values.ANYSCI: True, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '3e+003': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '+3.e5': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '+2e-04': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '+.34e23': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, - #~ Values.POSDEC: True, Values.NEGDEC: False, Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '+.48e-2': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: True, Values.NEGSCI: False, Values.ANYSCI: True, - #~ Values.POSDEC: True, Values.NEGDEC: False, 
Values.ANYDEC: True, - #~ Values.POSNUM: True, Values.NEGNUM: False, Values.ANYNUM: True - #~ }, - #~ '-2e-04': { - #~ Values.POSINT: False, Values.NEGINT: False, Values.ANYINT: False, - #~ Values.POSFLOAT: False, Values.NEGFLOAT: False, Values.ANYFLOAT: False, - #~ Values.POSSCI: False, Values.NEGSCI: True, Values.ANYSCI: True, - #~ Values.POSDEC: False, Values.NEGDEC: True, Values.ANYDEC: True, - #~ Values.POSNUM: False, Values.NEGNUM: True, Values.ANYNUM: True - #~ } - #~ # INVALID VALUES... '+-0.349', complex(?), etc. - #~ } + def test_number_and_sign_matching(self): + """Confirm number and sign patterns match the right string patterns.""" + from pent import Number, Sign + + from .testdata import number_sign_vals as vals + + for (n, s) in itt.product(Number, Sign): + pass + def suite_expect_good(): diff --git a/pent/test/testdata.py b/pent/test/testdata.py new file mode 100644 index 0000000..c6075b7 --- /dev/null +++ b/pent/test/testdata.py @@ -0,0 +1,443 @@ +r"""*Supporting test data for* ``pent`` *test suite*. + +``pent`` Extracts Numerical Text. 
+ +**Author** + Brian Skinn (bskinn@alum.mit.edu) + +**File Created** + 3 Sep 2018 + +**Copyright** + \(c) Brian Skinn 2018 + +**Source Repository** + http://www.github.com/bskinn/pent + +**Documentation** + http://pent.readthedocs.io + +**License** + The MIT License; see |license_txt|_ for full license terms + +**Members** + +*(none documented)* + +""" + + +from pent import Number, Sign + + +number_sign_vals = { + '0': { + (Number.Integer, Sign.Positive): True, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '-0': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): True, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True + }, + '+0.': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + 
(Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '-.00': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True + }, + '+35': { + (Number.Integer, Sign.Positive): True, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '23': { + (Number.Integer, Sign.Positive): True, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): True, + (Number.General, 
Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '-12': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): True, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True + }, + '.12': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '35.': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '+218.': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, 
Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '+.355': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '0.23': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '-.22': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + 
(Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True + }, + '-234.': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True + }, + '-392.34': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True + }, + '+3e3': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, 
Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '+3e+3': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '+3e+003': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '3e+003': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '+3.e5': { + (Number.Integer, 
Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '+2e-04': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '+.34e23': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '+.48e-2': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, 
Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True + }, + '-2e-04': { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): True, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True + } + # INVALID VALUES... '+-0.349', complex(?), etc. +} diff --git a/tests.py b/tests.py index 62982a6..7e17e70 100644 --- a/tests.py +++ b/tests.py @@ -1,17 +1,30 @@ -# ------------------------------------------------------------------------------ -# Name: tests -# Purpose: Master script for pent testing suite -# -# Author: Brian Skinn -# bskinn@alum.mit.edu -# -# Created: 2 Sep 2018 -# Copyright: (c) Brian Skinn 2018 -# License: The MIT License; see "LICENSE.txt" for full license terms. -# -# http://www.github.com/bskinn/pent -# -# ------------------------------------------------------------------------------ +r"""*Master script for* ``pent`` *test suite*. + +``pent`` Extracts Numerical Text. 
+ +**Author** + Brian Skinn (bskinn@alum.mit.edu) + +**File Created** + 3 Sep 2018 + +**Copyright** + \(c) Brian Skinn 2018 + +**Source Repository** + http://www.github.com/bskinn/pent + +**Documentation** + http://pent.readthedocs.io + +**License** + The MIT License; see |license_txt|_ for full license terms + +**Members** + +*(none documented)* + +""" class AP(object): @@ -75,26 +88,6 @@ def addsuiteif(suite, flags): addsuiteif(pent.test.pent_base.suite_expect_good(), [AP.ALL]) -# addsuiteif(sphobjinv.test.sphobjinv_api.suite_api_expect_good(), -# [AP.ALL, AP.LOCAL, AP.GOOD, AP.GOOD_LOCAL, -# AP.API, AP.API_LOCAL, AP.API_GOOD, AP.API_GOOD_LOCAL]) -# addsuiteif(sphobjinv.test.sphobjinv_api.suite_api_expect_good_nonlocal(), -# [AP.ALL, AP.GOOD, AP.API, AP.API_GOOD]) -# addsuiteif(sphobjinv.test.sphobjinv_cli.suite_cli_expect_good(), -# [AP.ALL, AP.LOCAL, AP.GOOD, AP.GOOD_LOCAL, -# AP.CLI, AP.CLI_LOCAL, AP.CLI_GOOD, AP.CLI_GOOD_LOCAL]) -# addsuiteif(sphobjinv.test.sphobjinv_cli.suite_cli_expect_good_nonlocal(), -# [AP.ALL, AP.GOOD, AP.CLI, AP.CLI_GOOD]) - -# # Expect-fail tests -# addsuiteif(sphobjinv.test.sphobjinv_api.suite_api_expect_fail(), -# [AP.ALL, AP.LOCAL, AP.FAIL, AP.API, AP.API_LOCAL, AP.API_FAIL]) -# addsuiteif(sphobjinv.test.sphobjinv_cli.suite_cli_expect_fail(), -# [AP.ALL, AP.LOCAL, AP.FAIL, AP.FAIL_LOCAL, -# AP.CLI, AP.CLI_LOCAL, AP.CLI_FAIL, AP.CLI_FAIL_LOCAL]) -# addsuiteif(sphobjinv.test.sphobjinv_cli.suite_cli_expect_fail_nonlocal(), -# [AP.ALL, AP.FAIL, AP.CLI, AP.CLI_FAIL]) - # Create the test runner and execute ttr = ut.TextTestRunner(buffer=True, verbosity=(2 if params['v'] else 1), From ac75b97f57bd9e23af41c4fffd18d40116444566 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 3 Sep 2018 21:44:45 -0400 Subject: [PATCH 04/44] ADMIN/TEST/DEV: Blacken; start pattern lay-in Going to be a LOT of logistical decisions as the pattern definition expands. Oy. 
--- black | 4 + black.bat | 4 + pent/__init__.py | 5 +- pent/enums.py | 17 +- pent/patterns.py | 100 +++-- pent/test/__init__.py | 4 +- pent/test/pent_base.py | 20 +- pent/test/testdata.py | 818 ++++++++++++++++++++--------------------- requirements-dev.txt | 1 + setup.py | 44 +-- tests.py | 24 +- 11 files changed, 541 insertions(+), 500 deletions(-) create mode 100755 black create mode 100644 black.bat diff --git a/black b/black new file mode 100755 index 0000000..910c829 --- /dev/null +++ b/black @@ -0,0 +1,4 @@ +#! /bin/bash + +black tests.py pent pent/test + diff --git a/black.bat b/black.bat new file mode 100644 index 0000000..c2cde03 --- /dev/null +++ b/black.bat @@ -0,0 +1,4 @@ +@echo off + +black tests.py pent pent\test + diff --git a/pent/__init__.py b/pent/__init__.py index 458e65f..9aba1c0 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -27,8 +27,9 @@ from __future__ import absolute_import -__all__ = ['Number', 'Sign'] +__all__ = ["Number", "Sign", "number_patterns", "wordify_pattern", "std_wordify"] from .enums import Number, Sign +from .patterns import number_patterns, wordify_pattern, std_wordify -__version__ = '0.1dev1' +__version__ = "0.1dev1" diff --git a/pent/enums.py b/pent/enums.py index 0fa5070..3cd85dc 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -26,33 +26,34 @@ from enum import Enum + class Number(Enum): """Enumeration for the different kinds of recognized number primitives.""" #: Integer value; no decimal or scientific/exponential notation - Integer = 'int' + Integer = "int" #: Floating-point value; no scientific/exponential notation - Float = 'float' + Float = "float" #: Scientific/exponential notation, where exponent is *required* - SciNot = 'sci' + SciNot = "sci" #: "Decimal" value; floating-point value with or without an exponent - Decimal = 'dec' + Decimal = "dec" #: "General" value; integer, float, or scientific notation - General = 'gen' + General = "gen" class Sign(Enum): """Enumeration for the different kinds of 
recognized numerical signs.""" #: Positive value only (leading '+' optional; includes zero) - Positive = 'pos' + Positive = "pos" #: Negative value only (leading '-' required; includes negative zero) - Negative = 'neg' + Negative = "neg" #: Any sign - Any = 'any' + Any = "any" diff --git a/pent/patterns.py b/pent/patterns.py index 0ac68b9..39e7266 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -26,59 +26,79 @@ import pyparsing as pp +from .enums import Number, Sign -class Numbers: - """Patterns matching single numbers.""" +#: |str| with the standard numerical punctuation to include as not +#: marking word boundaries. `de` is included to account for scientific +#: notation. +std_num_punct = "+-.de" + +#: |dict| of ``pyparsing`` patterns matching single numbers. +number_patterns = { + (Number.Integer, Sign.Positive): pp.Combine(pp.Optional("+") + pp.Word(pp.nums)) +} + + +def wordify_pattern(p, word_chars): + """Wrap a pattern with word start/end markers using arbitrary word chars.""" + ws = pp.WordStart(word_chars) + we = pp.WordEnd(word_chars) + + return pp.Combine(ws + p + we) + + +def std_wordify(p): + """Wrap a token in the ``pent`` standard word start/end markers.""" + return wordify_pattern(p, pp.nums + std_num_punct) - pass # pyparsing patterns from initial work. Definitely remove the .WordStart # and .WordEnd tokens from these core definitions. -#~ ppps = {} -#~ num_punct = '+-.' 
-#~ ppps.update({Values.POSINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + - #~ pp.Optional('+') + - #~ pp.Word(pp.nums) + - #~ pp.WordEnd(pp.alphanums + num_punct))}) -#~ ppps.update({Values.NEGINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + - #~ pp.Literal('-') + - #~ pp.Word(pp.nums) + - #~ pp.WordEnd(pp.alphanums + num_punct))}) -#~ ppps.update({Values.ANYINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + - #~ pp.Optional(pp.Literal('+') ^ pp.Literal('-')) + - #~ pp.Word(pp.nums) + - #~ pp.WordEnd(pp.alphanums + num_punct))}) +# ~ ppps = {} +# ~ num_punct = '+-.' +# ~ ppps.update({Values.POSINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + +# ~ pp.Optional('+') + +# ~ pp.Word(pp.nums) + +# ~ pp.WordEnd(pp.alphanums + num_punct))}) +# ~ ppps.update({Values.NEGINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + +# ~ pp.Literal('-') + +# ~ pp.Word(pp.nums) + +# ~ pp.WordEnd(pp.alphanums + num_punct))}) +# ~ ppps.update({Values.ANYINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + +# ~ pp.Optional(pp.Literal('+') ^ pp.Literal('-')) + +# ~ pp.Word(pp.nums) + +# ~ pp.WordEnd(pp.alphanums + num_punct))}) # Regex patterns from initial work: -#~ # Integers (code i) -#~ strs.update({Values.POSINT: '[+]?\\d+'}) -#~ strs.update({Values.NEGINT: '-\\d+'}) -#~ strs.update({Values.ANYINT: '[-+]?\\d+'}) +# ~ # Integers (code i) +# ~ strs.update({Values.POSINT: '[+]?\\d+'}) +# ~ strs.update({Values.NEGINT: '-\\d+'}) +# ~ strs.update({Values.ANYINT: '[-+]?\\d+'}) -#~ # Floats (code f) -#~ strs.update({Values.POSFLOAT: '[+]?(\\d+\\.\\d*|\\d*\\.\\d+)'}) -#~ strs.update({Values.NEGFLOAT: '-(\\d+\\.\\d*|\\d*\\.\\d+)'}) -#~ strs.update({Values.ANYFLOAT: '[-+]?(\\d+\\.\\d*|\\d*\\.\\d+)'}) +# ~ # Floats (code f) +# ~ strs.update({Values.POSFLOAT: '[+]?(\\d+\\.\\d*|\\d*\\.\\d+)'}) +# ~ strs.update({Values.NEGFLOAT: '-(\\d+\\.\\d*|\\d*\\.\\d+)'}) +# ~ strs.update({Values.ANYFLOAT: '[-+]?(\\d+\\.\\d*|\\d*\\.\\d+)'}) -#~ # Scinot (code s; accepts 
both d and e for the exponent marker) -#~ strs.update({Values.POSSCI: '[+]?(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) -#~ strs.update({Values.NEGSCI: '-(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) -#~ strs.update({Values.ANYSCI: '[-+]?(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) +# ~ # Scinot (code s; accepts both d and e for the exponent marker) +# ~ strs.update({Values.POSSCI: '[+]?(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) +# ~ strs.update({Values.NEGSCI: '-(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) +# ~ strs.update({Values.ANYSCI: '[-+]?(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) -#~ # Float or scinot (code d, for ... decimal?) -#~ strs.update({Values.POSDEC: '[+]?(\\d+\\.\\d*|\\d*\\.\\d+|\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) -#~ strs.update({Values.NEGDEC: '-(\\d+\\.\\d*|\\d*\\.\\d+|\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[dDEe][-+]?\\d+)'}) -#~ strs.update({Values.ANYDEC: '[-+]?(\\d+\\.\\d*|\\d*\\.\\d+|\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) +# ~ # Float or scinot (code d, for ... decimal?) 
+# ~ strs.update({Values.POSDEC: '[+]?(\\d+\\.\\d*|\\d*\\.\\d+|\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) +# ~ strs.update({Values.NEGDEC: '-(\\d+\\.\\d*|\\d*\\.\\d+|\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[dDEe][-+]?\\d+)'}) +# ~ strs.update({Values.ANYDEC: '[-+]?(\\d+\\.\\d*|\\d*\\.\\d+|\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) -#~ # Any numerical value (code n) -#~ # This one is simpler than decimal because the first pattern option of the two matches integers (everything -#~ # after the initial `\\d+` is optional) -#~ strs.update({Values.POSNUM: '[+]?(\\d+\\.?\\d*([deDE][-+]?\\d+)?|\\d*\\.\\d+([deDE][-+]?\\d+)?)'}) -#~ strs.update({Values.NEGNUM: '-(\\d+\\.?\\d*([deDE][-+]?\\d+)?|\\d*\\.\\d+([deDE][-+]?\\d+)?)'}) -#~ strs.update({Values.ANYNUM: '[-+]?(\\d+\\.?\\d*([deDE][-+]?\\d+)?|\\d*\\.\\d+([deDE][-+]?\\d+)?)'}) +# ~ # Any numerical value (code n) +# ~ # This one is simpler than decimal because the first pattern option of the two matches integers (everything +# ~ # after the initial `\\d+` is optional) +# ~ strs.update({Values.POSNUM: '[+]?(\\d+\\.?\\d*([deDE][-+]?\\d+)?|\\d*\\.\\d+([deDE][-+]?\\d+)?)'}) +# ~ strs.update({Values.NEGNUM: '-(\\d+\\.?\\d*([deDE][-+]?\\d+)?|\\d*\\.\\d+([deDE][-+]?\\d+)?)'}) +# ~ strs.update({Values.ANYNUM: '[-+]?(\\d+\\.?\\d*([deDE][-+]?\\d+)?|\\d*\\.\\d+([deDE][-+]?\\d+)?)'}) -if __name__ == '__main__': +if __name__ == "__main__": print("Module not executable.") diff --git a/pent/test/__init__.py b/pent/test/__init__.py index 12d8ea1..0909734 100644 --- a/pent/test/__init__.py +++ b/pent/test/__init__.py @@ -17,8 +17,6 @@ from __future__ import absolute_import -__all__ = ['suite_expect_good', - ] +__all__ = ["suite_expect_good"] from .pent_base import suite_expect_good - diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 5167742..e03ae5e 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -28,10 +28,10 @@ import itertools as itt -import os -import os.path as osp 
import unittest as ut +import pyparsing as pp + class TestPentCorePatterns(ut.TestCase): """Confirming basic pattern matching of the core pyparsing patterns.""" @@ -39,12 +39,22 @@ class TestPentCorePatterns(ut.TestCase): def test_number_and_sign_matching(self): """Confirm number and sign patterns match the right string patterns.""" from pent import Number, Sign + from pent import number_patterns from .testdata import number_sign_vals as vals - for (n, s) in itt.product(Number, Sign): - pass + for (v, n, s) in itt.product(vals, Number, Sign): + testname = "{0}_{1}_{2}".format(v, n, s) + with self.subTest(testname): + p = number_patterns[(n, s)] + try: + p.parseString(v) + except pp.ParseException: + res = False + else: + res = True + self.assertEqual(vals[v][(n, s)], res) def suite_expect_good(): @@ -55,5 +65,5 @@ def suite_expect_good(): return s -if __name__ == '__main__': +if __name__ == "__main__": print("Module not executable.") diff --git a/pent/test/testdata.py b/pent/test/testdata.py index c6075b7..199530e 100644 --- a/pent/test/testdata.py +++ b/pent/test/testdata.py @@ -31,413 +31,413 @@ number_sign_vals = { - '0': { - (Number.Integer, Sign.Positive): True, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): True, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): False, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '-0': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): True, - (Number.Integer, Sign.Any): True, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - 
(Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): False, - (Number.General, Sign.Positive): False, - (Number.General, Sign.Negative): True, - (Number.General, Sign.Any): True - }, - '+0.': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '-.00': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): True, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): True, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): False, - (Number.General, Sign.Negative): True, - (Number.General, Sign.Any): True - }, - '+35': { - (Number.Integer, Sign.Positive): True, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): True, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, 
Sign.Negative): False, - (Number.Decimal, Sign.Any): False, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '23': { - (Number.Integer, Sign.Positive): True, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): True, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): False, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '-12': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): True, - (Number.Integer, Sign.Any): True, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): False, - (Number.General, Sign.Positive): False, - (Number.General, Sign.Negative): True, - (Number.General, Sign.Any): True - }, - '.12': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - 
}, - '35.': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '+218.': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '+.355': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '0.23': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - 
(Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '-.22': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): True, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): True, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): False, - (Number.General, Sign.Negative): True, - (Number.General, Sign.Any): True - }, - '-234.': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): True, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): True, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): False, - (Number.General, Sign.Negative): True, - (Number.General, Sign.Any): True - }, - '-392.34': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): True, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, 
Sign.Any): False, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): True, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): False, - (Number.General, Sign.Negative): True, - (Number.General, Sign.Any): True - }, - '+3e3': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): True, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '+3e+3': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): True, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '+3e+003': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): True, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, 
- (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '3e+003': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): True, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '+3.e5': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): True, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '+2e-04': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): True, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '+.34e23': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): 
False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): True, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '+.48e-2': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): True, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): True, - (Number.General, Sign.Negative): False, - (Number.General, Sign.Any): True - }, - '-2e-04': { - (Number.Integer, Sign.Positive): False, - (Number.Integer, Sign.Negative): False, - (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): True, - (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): True, - (Number.Decimal, Sign.Any): True, - (Number.General, Sign.Positive): False, - (Number.General, Sign.Negative): True, - (Number.General, Sign.Any): True - } - # INVALID VALUES... '+-0.349', complex(?), etc. 
+ "0": { + (Number.Integer, Sign.Positive): True, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "-0": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): True, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True, + }, + "+0.": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "-.00": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + 
(Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True, + }, + "+35": { + (Number.Integer, Sign.Positive): True, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "23": { + (Number.Integer, Sign.Positive): True, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "-12": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): True, + (Number.Integer, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, 
Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True, + }, + ".12": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "35.": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "+218.": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, 
+ (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "+.355": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "0.23": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "-.22": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True, + }, + "-234.": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): 
False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True, + }, + "-392.34": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True, + }, + "+3e3": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "+3e+3": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + 
(Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "+3e+003": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "3e+003": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "+3.e5": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + 
(Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "+2e-04": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "+.34e23": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): True, + }, + "+.48e-2": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): True, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): True, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): True, + (Number.General, Sign.Negative): False, + 
(Number.General, Sign.Any): True, + }, + "-2e-04": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): True, + (Number.SciNot, Sign.Any): True, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): True, + (Number.Decimal, Sign.Any): True, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): True, + (Number.General, Sign.Any): True, + } + # INVALID VALUES... '+-0.349', complex(?), etc. } diff --git a/requirements-dev.txt b/requirements-dev.txt index a7cde5a..bcdb66d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,4 +8,5 @@ coverage tox wget pyparsing +black diff --git a/setup.py b/setup.py index 3f76e04..0f04050 100644 --- a/setup.py +++ b/setup.py @@ -2,30 +2,32 @@ def readme(): - with open('README.rst', 'r') as f: + with open("README.rst", "r") as f: return f.read() setup( - name='pent', - version='0.0', - packages=['pent'], - url='https://www.github.com/bskinn/pent', - license='MIT License', - author='Brian Skinn', - author_email='bskinn@alum.mit.edu', - description='Pent Extracts Numerical Text', + name="pent", + version="0.0", + packages=["pent"], + url="https://www.github.com/bskinn/pent", + license="MIT License", + author="Brian Skinn", + author_email="bskinn@alum.mit.edu", + description="Pent Extracts Numerical Text", long_description=readme(), - classifiers=['License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Intended Audience :: Science/Research', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Topic :: Scientific/Engineering', - 'Topic :: 
Scientific/Engineering :: Mathematics', - 'Topic :: Utilities', - 'Development Status :: 1 - Planning'], + classifiers=[ + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Intended Audience :: Science/Research", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Utilities", + "Development Status :: 1 - Planning", + ], ) diff --git a/tests.py b/tests.py index 7e17e70..cb43812 100644 --- a/tests.py +++ b/tests.py @@ -33,7 +33,8 @@ class AP(object): Also includes PFX, a helper string for substitution/formatting. """ - ALL = 'all' + + ALL = "all" PFX = "--{0}" @@ -45,15 +46,17 @@ def get_parser(): prs = argparse.ArgumentParser(description="Run tests for pent") # Verbosity argument - prs.add_argument('-v', action='store_true', - help="Show verbose output") + prs.add_argument("-v", action="store_true", help="Show verbose output") # Test subgroups # Options without subgroups - prs.add_argument(AP.PFX.format(AP.ALL), '-a', - action='store_true', - help="Run all tests (overrides any other selections)") + prs.add_argument( + AP.PFX.format(AP.ALL), + "-a", + action="store_true", + help="Run all tests (overrides any other selections)", + ) # Return the parser return prs @@ -85,18 +88,15 @@ def addsuiteif(suite, flags): # Add commandline-indicated tests per-group # Expect-good tests - addsuiteif(pent.test.pent_base.suite_expect_good(), - [AP.ALL]) + addsuiteif(pent.test.pent_base.suite_expect_good(), [AP.ALL]) # Create the test runner and execute - ttr = ut.TextTestRunner(buffer=True, - verbosity=(2 if params['v'] else 1), - ) + ttr = ut.TextTestRunner(buffer=True, verbosity=(2 if params["v"] else 1)) success = ttr.run(ts).wasSuccessful() # Return based on success result (lets tox 
report success/fail) sys.exit(0 if success else 1) -if __name__ == '__main__': +if __name__ == "__main__": main() From dc852f7f8f78df3f687985fbb10c6b44517b3de9 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 3 Sep 2018 21:52:24 -0400 Subject: [PATCH 05/44] DOC: Create docs stub. --- doc/Makefile | 20 +++++ doc/make.bat | 36 +++++++++ doc/source/conf.py | 174 +++++++++++++++++++++++++++++++++++++++++++ doc/source/index.rst | 20 +++++ 4 files changed, 250 insertions(+) create mode 100644 doc/Makefile create mode 100644 doc/make.bat create mode 100644 doc/source/conf.py create mode 100644 doc/source/index.rst diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..7e8794a --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = pent +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..e37613f --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build +set SPHINXPROJ=pent + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %2 +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 0000000..2b88ddb --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'pent' +copyright = '2018, Brian Skinn' +author = 'Brian Skinn' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '0.1' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.mathjax', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. 
Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pentdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pent.tex', 'pent Documentation', + 'Brian Skinn', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pent', 'pent Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pent', 'pent Documentation', + author, 'pent', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. 
+intersphinx_mapping = {'https://docs.python.org/': None} + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True \ No newline at end of file diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 0000000..11228ff --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,20 @@ +.. pent documentation master file, created by + sphinx-quickstart on Mon Sep 3 21:50:56 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to pent's documentation! +================================ + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` From 33251b25cf583fd5c114d1fca83024d3af890afe Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 3 Sep 2018 21:56:08 -0400 Subject: [PATCH 06/44] DEV: Add integer patterns. Change black line length. Tests seem ok! black now set at 80-character lines --- black | 2 +- black.bat | 2 +- pent/__init__.py | 8 +++++++- pent/patterns.py | 35 +++++++++++++---------------------- pent/test/pent_base.py | 11 ++++++----- requirements-dev.txt | 4 ++-- 6 files changed, 30 insertions(+), 32 deletions(-) diff --git a/black b/black index 910c829..04dfcc9 100755 --- a/black +++ b/black @@ -1,4 +1,4 @@ #! 
/bin/bash -black tests.py pent pent/test +black tests.py pent pent/test -l 80 diff --git a/black.bat b/black.bat index c2cde03..9bf8061 100644 --- a/black.bat +++ b/black.bat @@ -1,4 +1,4 @@ @echo off -black tests.py pent pent\test +black tests.py pent pent\test -l 80 diff --git a/pent/__init__.py b/pent/__init__.py index 9aba1c0..506117b 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -27,7 +27,13 @@ from __future__ import absolute_import -__all__ = ["Number", "Sign", "number_patterns", "wordify_pattern", "std_wordify"] +__all__ = [ + "Number", + "Sign", + "number_patterns", + "wordify_pattern", + "std_wordify", +] from .enums import Number, Sign from .patterns import number_patterns, wordify_pattern, std_wordify diff --git a/pent/patterns.py b/pent/patterns.py index 39e7266..61fdec7 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -33,11 +33,6 @@ #: notation. std_num_punct = "+-.de" -#: |dict| of ``pyparsing`` patterns matching single numbers. -number_patterns = { - (Number.Integer, Sign.Positive): pp.Combine(pp.Optional("+") + pp.Word(pp.nums)) -} - def wordify_pattern(p, word_chars): """Wrap a pattern with word start/end markers using arbitrary word chars.""" @@ -52,26 +47,22 @@ def std_wordify(p): return wordify_pattern(p, pp.nums + std_num_punct) +#: |dict| of ``pyparsing`` patterns matching single numbers. +number_patterns = { + (Number.Integer, Sign.Positive): pp.Combine( + pp.Optional("+") + pp.Word(pp.nums) + ), + (Number.Integer, Sign.Negative): pp.Combine( + pp.Literal("-") + pp.Word(pp.nums) + ), + (Number.Integer, Sign.Any): pp.Combine( + pp.Optional(pp.Literal("-") ^ pp.Literal("+")) + pp.Word(pp.nums) + ), +} + # pyparsing patterns from initial work. Definitely remove the .WordStart # and .WordEnd tokens from these core definitions. -# ~ ppps = {} -# ~ num_punct = '+-.' 
-# ~ ppps.update({Values.POSINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + -# ~ pp.Optional('+') + -# ~ pp.Word(pp.nums) + -# ~ pp.WordEnd(pp.alphanums + num_punct))}) -# ~ ppps.update({Values.NEGINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + -# ~ pp.Literal('-') + -# ~ pp.Word(pp.nums) + -# ~ pp.WordEnd(pp.alphanums + num_punct))}) -# ~ ppps.update({Values.ANYINT: pp.Combine(pp.WordStart(pp.alphanums + num_punct) + -# ~ pp.Optional(pp.Literal('+') ^ pp.Literal('-')) + -# ~ pp.Word(pp.nums) + -# ~ pp.WordEnd(pp.alphanums + num_punct))}) - -# Regex patterns from initial work: - # ~ # Integers (code i) # ~ strs.update({Values.POSINT: '[+]?\\d+'}) # ~ strs.update({Values.NEGINT: '-\\d+'}) diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index e03ae5e..b0cb391 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -38,17 +38,18 @@ class TestPentCorePatterns(ut.TestCase): def test_number_and_sign_matching(self): """Confirm number and sign patterns match the right string patterns.""" - from pent import Number, Sign - from pent import number_patterns + import pent from .testdata import number_sign_vals as vals - for (v, n, s) in itt.product(vals, Number, Sign): + for (v, n, s) in itt.product(vals, pent.Number, pent.Sign): testname = "{0}_{1}_{2}".format(v, n, s) with self.subTest(testname): - p = number_patterns[(n, s)] + npat = pent.number_patterns[(n, s)] + npat = pent.std_wordify(npat) + try: - p.parseString(v) + npat.parseString(v) except pp.ParseException: res = False else: diff --git a/requirements-dev.txt b/requirements-dev.txt index bcdb66d..fb17f42 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,6 @@ attrs<18 -sphinx==1.6.5 -sphinx_rtd_theme==0.2.4 +sphinx==1.7.8 +sphinx_rtd_theme==0.4.1 ipython flake8 flake8-docstrings From 8dc3c4d3900549bea117418fa3fb3220b7243507 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 3 Sep 2018 22:24:10 -0400 Subject: [PATCH 07/44] DEV/ADMIN: Black to 79char; 
add patterns pyparsing is definitely making adding patterns easier. Hopefully it has the flexibility I need to implement all the various downstream pattern recursion and substitution. --- black | 2 +- black.bat | 2 +- pent/patterns.py | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/black b/black index 04dfcc9..c38582f 100755 --- a/black +++ b/black @@ -1,4 +1,4 @@ #! /bin/bash -black tests.py pent pent/test -l 80 +black tests.py pent pent/test -l 79 diff --git a/black.bat b/black.bat index 9bf8061..40fb74e 100644 --- a/black.bat +++ b/black.bat @@ -1,4 +1,4 @@ @echo off -black tests.py pent pent\test -l 80 +black tests.py pent pent\test -l 79 diff --git a/pent/patterns.py b/pent/patterns.py index 61fdec7..a0e8879 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -28,10 +28,16 @@ from .enums import Number, Sign + +#: |str| with the standard allowed scientific notation exponent +#: marker characters +std_scinot_markers = "deDE" + + #: |str| with the standard numerical punctuation to include as not #: marking word boundaries. `de` is included to account for scientific #: notation. -std_num_punct = "+-.de" +std_num_punct = "+-." 
+ std_scinot_markers def wordify_pattern(p, word_chars): @@ -47,6 +53,33 @@ def std_wordify(p): return wordify_pattern(p, pp.nums + std_num_punct) +_p_floatnums = pp.Or( + ( + pp.Word(pp.nums) + pp.Literal(".") + pp.Optional(pp.Word(pp.nums)), + pp.Optional(pp.Word(pp.nums)) + pp.Literal(".") + pp.Word(pp.nums), + ) +) + + +_p_scinums = pp.Or( + ( + pp.Word(pp.nums) + + pp.Optional(pp.Literal(".")) + + pp.Optional(pp.Word(pp.nums)) + + pp.Word(std_scinot_markers) + + pp.Optional(pp.Word("+-")) + + pp.Word(pp.nums), + pp.Optional(pp.Word(pp.nums)) + + pp.Literal(".") + + pp.Word(pp.nums) + + pp.Word(std_scinot_markers) + + pp.Optional(pp.Word("+-")) + + pp.Word(pp.nums), + ) +) +# ~ strs.update({Values.POSSCI: '[+]?(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) + + #: |dict| of ``pyparsing`` patterns matching single numbers. number_patterns = { (Number.Integer, Sign.Positive): pp.Combine( @@ -56,7 +89,17 @@ def std_wordify(p): pp.Literal("-") + pp.Word(pp.nums) ), (Number.Integer, Sign.Any): pp.Combine( - pp.Optional(pp.Literal("-") ^ pp.Literal("+")) + pp.Word(pp.nums) + pp.Optional(pp.Word("-+")) + pp.Word(pp.nums) + ), + (Number.Float, Sign.Positive): pp.Combine(pp.Optional("+") + _p_floatnums), + (Number.Float, Sign.Negative): pp.Combine(pp.Literal("-") + _p_floatnums), + (Number.Float, Sign.Any): pp.Combine( + pp.Optional(pp.Word("+-")) + _p_floatnums + ), + (Number.SciNot, Sign.Positive): pp.Combine(pp.Optional("+") + _p_scinums), + (Number.SciNot, Sign.Negative): pp.Combine(pp.Literal("-") + _p_scinums), + (Number.SciNot, Sign.Any): pp.Combine( + pp.Optional(pp.Word("+-")) + _p_scinums ), } From 1670c37a842789b0111eb1caa03cc41ca6080c88 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Thu, 6 Sep 2018 00:15:16 -0400 Subject: [PATCH 08/44] ADMIN: Fix Windows black.bat Without specifying black.exe, the batch file just runs in an infinite loop! 
--- black.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/black.bat b/black.bat index 40fb74e..637bc4a 100644 --- a/black.bat +++ b/black.bat @@ -1,4 +1,4 @@ @echo off -black tests.py pent pent\test -l 79 +black.exe tests.py pent pent\test -l 79 From de294c135906481d6b3f2c8209e5d50a6f6fab1d Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sat, 8 Sep 2018 22:42:48 -0400 Subject: [PATCH 09/44] DEV: Various; think pyparsing won't work Pretty sure pyparsing is meant for a different use-case, and it's just not going to work well here. Plan is to switch back to plain regex. --- .gitignore | 3 ++ doc/source/conf.py | 22 +++++++------ doc/source/isphx/objpull.py | 65 +++++++++++++++++++++++++++++++++++++ pent/patterns.py | 26 ++++++++++++--- pent/test/pent_base.py | 48 ++++++++++++++++++++++----- requirements-dev.txt | 1 + 6 files changed, 143 insertions(+), 22 deletions(-) create mode 100644 doc/source/isphx/objpull.py diff --git a/.gitignore b/.gitignore index ad39bf0..45fe288 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,6 @@ ENV/ *.bak *.tmp +# objects.inv in docs +doc/source/isphx/*.inv + diff --git a/doc/source/conf.py b/doc/source/conf.py index 2b88ddb..84f5665 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -12,9 +12,10 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os +import os.path as osp +import sys +sys.path.insert(0, osp.abspath(osp.join(os.pardir, os.pardir))) # -- Project information ----------------------------------------------------- @@ -24,16 +25,16 @@ author = 'Brian Skinn' # The short X.Y version -version = '' +version = '0.1' # The full version, including alpha/beta/rc tags -release = '0.1' +release = '0.1dev1' # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # -# needs_sphinx = '1.0' +needs_sphinx = '1.6' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -45,6 +46,7 @@ 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', + 'sphinx_issues', ] # Add any paths that contain templates here, relative to this directory. @@ -80,7 +82,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -156,7 +158,7 @@ # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'pent', 'pent Documentation', - author, 'pent', 'One line description of project.', + author, 'pent', 'pent Extracts Numerical Text.', 'Miscellaneous'), ] @@ -166,9 +168,9 @@ # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = {'https://docs.python.org/': None} +intersphinx_mapping = {'python': ('https://docs.python.org/', 'isphx/objects_python.inv')} # -- Options for todo extension ---------------------------------------------- # If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = True \ No newline at end of file +todo_include_todos = True diff --git a/doc/source/isphx/objpull.py b/doc/source/isphx/objpull.py new file mode 100644 index 0000000..5b7ac39 --- /dev/null +++ b/doc/source/isphx/objpull.py @@ -0,0 +1,65 @@ +# Quickie script for refreshing the local objects.inv cache +# OVERWRITES EXISTING FILES, WITH PRE-DELETION + + + +def pullobjs(): + + import os + import urllib.request as urlrq + + import certifi + + # Open conf.py, retrieve content and compile + with open(os.path.join(os.pardir, 'conf.py'), 'r') as f: + confcode = compile(f.read(), 'conf.py', 'exec') + + # Execute conf.py into the global namespace (I know, sloppy) + exec(confcode, globals()) + + # Define the file format string if not defined in conf.py + if not 'isphx_objstr' in dir(): + isphx_objstr = 'objects_{0}.inv' + + # Iterate intersphinx_mapping from conf.py to retrieve the objects.inv files + # Make use of the conf.py 'isphx_objstr' substitution string, too + for n, t in intersphinx_mapping.items(): + + print('{0}:\n'.format(n) + '-' * 16) + + try: + os.remove(isphx_objstr.format(n)) + except FileNotFoundError: + pass # No big deal + + try: + resp = urlrq.urlopen(t[0] + '/objects.inv', cafile=certifi.where()) + except Exception as e: + print('HTTP request failed:\n' + str(e) + '\n') + continue + else: + print('... located ...') + + try: + b_s = resp.read() + except Exception as e: + print('Download failed:\n' + str(e) + '\n') + continue + else: + print('... downloaded ...') + + try: + with open(isphx_objstr.format(n), 'wb') as f: + f.write(b_s) + except Exception as e: + print('Write failed:\n' + str(e) + '\n') + continue + else: + print('... 
done.') + + print('') + + +if __name__ == '__main__': + + pullobjs() diff --git a/pent/patterns.py b/pent/patterns.py index a0e8879..940b42e 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -53,6 +53,9 @@ def std_wordify(p): return wordify_pattern(p, pp.nums + std_num_punct) +_p_intnums = pp.Word(pp.nums) + + _p_floatnums = pp.Or( ( pp.Word(pp.nums) + pp.Literal(".") + pp.Optional(pp.Word(pp.nums)), @@ -77,19 +80,24 @@ def std_wordify(p): + pp.Word(pp.nums), ) ) -# ~ strs.update({Values.POSSCI: '[+]?(\\d+\\.?\\d*[deDE][-+]?\\d+|\\d*\\.\\d+[deDE][-+]?\\d+)'}) + + +_p_decimalnums = pp.Or((_p_floatnums, _p_scinums)) + + +_p_generalnums = pp.Or((_p_floatnums, _p_scinums, _p_intnums)) #: |dict| of ``pyparsing`` patterns matching single numbers. number_patterns = { (Number.Integer, Sign.Positive): pp.Combine( - pp.Optional("+") + pp.Word(pp.nums) + pp.Optional("+") + _p_intnums ), (Number.Integer, Sign.Negative): pp.Combine( - pp.Literal("-") + pp.Word(pp.nums) + pp.Literal("-") + _p_intnums ), (Number.Integer, Sign.Any): pp.Combine( - pp.Optional(pp.Word("-+")) + pp.Word(pp.nums) + pp.Optional(pp.Word("-+")) + _p_intnums ), (Number.Float, Sign.Positive): pp.Combine(pp.Optional("+") + _p_floatnums), (Number.Float, Sign.Negative): pp.Combine(pp.Literal("-") + _p_floatnums), @@ -101,6 +109,16 @@ def std_wordify(p): (Number.SciNot, Sign.Any): pp.Combine( pp.Optional(pp.Word("+-")) + _p_scinums ), + (Number.Decimal, Sign.Positive): pp.Combine(pp.Optional("+") + _p_decimalnums), + (Number.Decimal, Sign.Negative): pp.Combine(pp.Literal("-") + _p_decimalnums), + (Number.Decimal, Sign.Any): pp.Combine( + pp.Optional(pp.Word("+-")) + _p_decimalnums + ), + (Number.General, Sign.Positive): pp.Combine(pp.Optional("+") + _p_generalnums), + (Number.General, Sign.Negative): pp.Combine(pp.Literal("-") + _p_generalnums), + (Number.General, Sign.Any): pp.Combine( + pp.Optional(pp.Word("+-")) + _p_generalnums + ), } # pyparsing patterns from initial work. 
Definitely remove the .WordStart diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index b0cb391..db0c723 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -36,6 +36,23 @@ class TestPentCorePatterns(ut.TestCase): """Confirming basic pattern matching of the core pyparsing patterns.""" + @staticmethod + def parsetest(npat, s): + """Run an individual parse test on `s` using pattern `npat`.""" + try: + npat.parseString(s) + except pp.ParseException: + res = False + else: + res = True + + return res + + @staticmethod + def testname(v, n, s): + """Compose test name from a numerical value and pattern Number/Sign.""" + return "{0}_{1}_{2}".format(v, n, s) + def test_number_and_sign_matching(self): """Confirm number and sign patterns match the right string patterns.""" import pent @@ -43,20 +60,35 @@ def test_number_and_sign_matching(self): from .testdata import number_sign_vals as vals for (v, n, s) in itt.product(vals, pent.Number, pent.Sign): - testname = "{0}_{1}_{2}".format(v, n, s) - with self.subTest(testname): + with self.subTest(self.testname(v, n, s)): npat = pent.number_patterns[(n, s)] npat = pent.std_wordify(npat) - try: - npat.parseString(v) - except pp.ParseException: - res = False - else: - res = True + res = self.parsetest(npat, v) self.assertEqual(vals[v][(n, s)], res) + def test_raw_single_value_space_delimited(self): + """Confirm single-value parsing works with raw pyparsing patterns.""" + import pent + + from .testdata import number_sign_vals as vals + + test_line = "This line contains the value {} with space delimit." + #~ test_line = "This line contains the value -2e4 with space delimit." 
+ + for v in vals: + test_str = test_line.format(v) + + for (n, s) in itt.product(pent.Number, pent.Sign): + with self.subTest(self.testname(v, n, s)): + npat = pent.number_patterns[(n, s)] + npat = pent.std_wordify(npat) + + res = self.parsetest(npat, test_str) + + self.assertEqual(vals[v][(n, s)], res, msg=test_str) + def suite_expect_good(): """Create and return the test suite for expect-good tests.""" diff --git a/requirements-dev.txt b/requirements-dev.txt index fb17f42..da82566 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,7 @@ attrs<18 sphinx==1.7.8 sphinx_rtd_theme==0.4.1 +sphinx-issues ipython flake8 flake8-docstrings From 775dafb3403a25ca6b476561582171d5888a1f81 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sat, 8 Sep 2018 23:36:52 -0400 Subject: [PATCH 10/44] DEV: Convert from pyparsing to regex Relatively uncomplicated switchover, actually. --- pent/__init__.py | 1 + pent/patterns.py | 114 ++++++++--------------------------------- pent/test/pent_base.py | 19 +++---- 3 files changed, 30 insertions(+), 104 deletions(-) diff --git a/pent/__init__.py b/pent/__init__.py index 506117b..ca5a34c 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -38,4 +38,5 @@ from .enums import Number, Sign from .patterns import number_patterns, wordify_pattern, std_wordify + __version__ = "0.1dev1" diff --git a/pent/patterns.py b/pent/patterns.py index 940b42e..fc24b95 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -24,7 +24,8 @@ """ -import pyparsing as pp +import itertools as itt +import re from .enums import Number, Sign @@ -37,119 +38,48 @@ #: |str| with the standard numerical punctuation to include as not #: marking word boundaries. `de` is included to account for scientific #: notation. -std_num_punct = "+-." 
+ std_scinot_markers +std_num_punct = std_scinot_markers + "+.-" def wordify_pattern(p, word_chars): """Wrap a pattern with word start/end markers using arbitrary word chars.""" - ws = pp.WordStart(word_chars) - we = pp.WordEnd(word_chars) - - return pp.Combine(ws + p + we) + return r"(? Date: Sun, 9 Sep 2018 00:22:45 -0400 Subject: [PATCH 11/44] DEV: Create Parser class, add simple test Autogeneration of composited regex seems to be working properly. Trick will now be to expand it in a robust and consistent way... --- pent/__init__.py | 2 ++ pent/parser.py | 67 ++++++++++++++++++++++++++++++++++++++++++ pent/patterns.py | 2 +- pent/test/pent_base.py | 48 +++++++++++++++++++++++++++--- 4 files changed, 114 insertions(+), 5 deletions(-) create mode 100644 pent/parser.py diff --git a/pent/__init__.py b/pent/__init__.py index ca5a34c..1c12421 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -28,6 +28,7 @@ from __future__ import absolute_import __all__ = [ + "Parser", "Number", "Sign", "number_patterns", @@ -36,6 +37,7 @@ ] from .enums import Number, Sign +from .parser import Parser from .patterns import number_patterns, wordify_pattern, std_wordify diff --git a/pent/parser.py b/pent/parser.py new file mode 100644 index 0000000..6e49990 --- /dev/null +++ b/pent/parser.py @@ -0,0 +1,67 @@ +r"""*Mini-language parser for* ``pent``. + +``pent`` Extracts Numerical Text. 
+ +**Author** + Brian Skinn (bskinn@alum.mit.edu) + +**File Created** + 8 Sep 2018 + +**Copyright** + \(c) Brian Skinn 2018 + +**Source Repository** + http://www.github.com/bskinn/pent + +**Documentation** + http://pent.readthedocs.io + +**License** + The MIT License; see |license_txt|_ for full license terms + +**Members** + +""" + +import attr + +from .enums import Number, Sign +from .patterns import number_patterns, wordify_pattern, std_wordify + + +@attr.s +class Parser: + """Mini-language parser for structured numerical data.""" + + def convert_line(self, line): + """Implement dirt-oversimple line converter.""" + import shlex + + tokens = shlex.split(line) + + pattern = r"(^|(?<=\n))" + + for t in tokens: + if t == "*": + pattern += ".*?" + # ~ elif t.startswith('"') and t.endswith('"'): + # ~ pattern += t[1:-1] + elif t == "i": + pattern += std_wordify( + number_patterns[(Number.Integer, Sign.Positive)] + ) + elif t.startswith("!"): + for c in t[1:]: + if c in "[\^$.|?*+(){}": + pattern += "\\" + c + else: + pattern += c + else: + raise ValueError("BAD PATTERN, NEED CUSTOM ERRORS!") + + # Plus anything to the end of the line + # THIS APPROACH *MAY* END UP BEING PROBLEMATIC + pattern += r".*?($|(?=\n))" + + return pattern diff --git a/pent/patterns.py b/pent/patterns.py index fc24b95..bbef782 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -1,4 +1,4 @@ -r"""``pyparsing`` *patterns for* ``pent``. +r"""*Regex patterns for* ``pent``. ``pent`` Extracts Numerical Text. 
diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 706e3e5..6029496 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -34,8 +34,8 @@ import pyparsing as pp -class TestPentCorePatterns(ut.TestCase): - """Confirming basic pattern matching of the core pyparsing patterns.""" +class SuperPent: + """Superclass of various test classes, with common methods.""" @staticmethod def parsetest(npat, s): @@ -49,6 +49,10 @@ def make_testname(v, n, s): """Compose test name from a numerical value and pattern Number/Sign.""" return "{0}_{1}_{2}".format(v, n, s) + +class TestPentCorePatterns(ut.TestCase, SuperPent): + """Confirming basic pattern matching of the core regex patterns.""" + def test_number_and_sign_matching(self): """Confirm number and sign patterns match the right string patterns.""" import pent @@ -65,7 +69,7 @@ def test_number_and_sign_matching(self): self.assertEqual(vals[v][(n, s)], res, msg=npat) def test_raw_single_value_space_delimited(self): - """Confirm single-value parsing works with raw pyparsing patterns.""" + """Confirm single-value parsing from a line works with raw patterns.""" import pent from .testdata import number_sign_vals as vals @@ -85,11 +89,47 @@ def test_raw_single_value_space_delimited(self): self.assertEqual(vals[v][(n, s)], res, msg=test_str) +class TestPentParserPatterns(ut.TestCase, SuperPent): + """Confirming pattern matching of patterns generated by the Parser.""" + + def test_simple_parser(self): + """Confirm the dummy initial parser works as expected.""" + import pent + + from .testdata import number_sign_vals as vals + + test_line = "This line contains the value {} with space delimit." + test_pat = "* !contains * i" + + prs = pent.Parser() + + for v in vals: + test_str = test_line.format(v) + + for (n, s) in itt.product(pent.Number, pent.Sign): + + # Expand this to other Signs eventually!!! 
+ if s is not pent.Sign.Positive or n is not pent.Number.Integer: + continue + + with self.subTest(self.make_testname(v, n, s)): + npat = prs.convert_line(test_pat) + + res = self.parsetest(npat, test_str) + + self.assertEqual(vals[v][(n, s)], res, msg=test_str) + + def suite_expect_good(): """Create and return the test suite for expect-good tests.""" s = ut.TestSuite() tl = ut.TestLoader() - s.addTests([tl.loadTestsFromTestCase(TestPentCorePatterns)]) + s.addTests( + [ + tl.loadTestsFromTestCase(TestPentCorePatterns), + tl.loadTestsFromTestCase(TestPentParserPatterns), + ] + ) return s From c1251d32a38baa4a237e3bad3db79355885d560a Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sun, 9 Sep 2018 16:32:30 -0400 Subject: [PATCH 12/44] DEV/TEST: Implement further elements of the parser New Enums for the numeric token fields, and the number/string match styles (capture or not for strings; various for numbers) Revise Enum values to the single-character flags for those types of values. Begin implementation of pyparsing as parser for the mini-language itself, even though the underlying search is run via regex. Probably for the best, as the regex engine appears to be *SUBSTANTIALLY* faster than pyparsing. Makes sense, if pyparsing is pure Python. Parser test now is actually a real test of the syntax! (Or, well, PART of the syntax, anyways...) 
--- pent/__init__.py | 5 +- pent/enums.py | 58 ++++++++++++++--- pent/parser.py | 137 +++++++++++++++++++++++++++++++++++------ pent/patterns.py | 2 +- pent/test/pent_base.py | 14 ++--- 5 files changed, 179 insertions(+), 37 deletions(-) diff --git a/pent/__init__.py b/pent/__init__.py index 1c12421..425fcbf 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -31,12 +31,15 @@ "Parser", "Number", "Sign", + "TokenField", + "NumberMatchType", + "StringMatchType", "number_patterns", "wordify_pattern", "std_wordify", ] -from .enums import Number, Sign +from .enums import Number, Sign, TokenField, NumberMatchType, StringMatchType from .parser import Parser from .patterns import number_patterns, wordify_pattern, std_wordify diff --git a/pent/enums.py b/pent/enums.py index 3cd85dc..8cba1c3 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -31,29 +31,71 @@ class Number(Enum): """Enumeration for the different kinds of recognized number primitives.""" #: Integer value; no decimal or scientific/exponential notation - Integer = "int" + Integer = "i" #: Floating-point value; no scientific/exponential notation - Float = "float" + Float = "f" #: Scientific/exponential notation, where exponent is *required* - SciNot = "sci" + SciNot = "s" #: "Decimal" value; floating-point value with or without an exponent - Decimal = "dec" + Decimal = "d" #: "General" value; integer, float, or scientific notation - General = "gen" + General = "g" class Sign(Enum): """Enumeration for the different kinds of recognized numerical signs.""" #: Positive value only (leading '+' optional; includes zero) - Positive = "pos" + Positive = "+" #: Negative value only (leading '-' required; includes negative zero) - Negative = "neg" + Negative = "-" #: Any sign - Any = "any" + Any = "." + + +class StringMatchType(Enum): + """Enumeration for the various match types on literal string fields.""" + + #: Captured match + Capture = "=" + + #: Ignored match + Ignore = "!" 
+ + +class NumberMatchType(Enum): + """Enumeration for the various match types on the numeric fields.""" + + #: Single value match + Single = "." + + #: Optional single value match + Optional = "?" + + #: One-or-more match + OneOrMore = "+" + + #: Zero-or-more match + ZeroOrMore = "*" + + #: Suppressed match + Suppress = "#" + + +class TokenField(Enum): + """Enumeration for fields within a mini-language number token.""" + + #: Type of number field (single value, one-or-more, zero-or-more, etc.) + Type = "type" + + #: Sign of acceptable values (any, positive, negative) + Sign = "sign" + + #: Format of the numerical value (int, float, scinot, decimal, general) + Number = "number" diff --git a/pent/parser.py b/pent/parser.py index 6e49990..b152480 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -25,43 +25,140 @@ """ import attr +import pyparsing as pp -from .enums import Number, Sign +from .enums import Number, Sign, TokenField +from .enums import NumberMatchType, StringMatchType from .patterns import number_patterns, wordify_pattern, std_wordify +# ## MINI-LANGUAGE PARSER DEFINITION ## + +# ## HELPERS ## +def _concat_values(e): + """Concatenate the values of the given Enum.""" + return "".join(_.value for _ in e) + + +_s_any_flag = "~" + +# ## ARBITRARY CONTENT ## +# Tilde says anything may be here, including multiple words +_pp_any_flag = pp.Literal(_s_any_flag) + +# ## LITERAL STRING ## +# Marker for the rest of the token to be a literal string +_pp_str_flag = pp.Word(_concat_values(StringMatchType), exact=1) + +# Remainder of the content after the marker, spaces included +_pp_str_content = pp.Word(pp.printables + " ") + +# Composite pattern for a literal string +_pp_string = _pp_str_flag + _pp_str_content + +# ## NUMERICAL VALUE ## +# Initial marker for a numerical value +_pp_num_flag = pp.Word(_concat_values(NumberMatchType), exact=1) + +# Marker for the sign of the value; period indicates either sign +_pp_num_sign = pp.Word(_concat_values(Sign), exact=1) + 
+# Marker for the number type to look for +_pp_num_type = pp.Word(_concat_values(Number), exact=1) + +# Composite pattern for a number +_pp_number = _pp_num_flag.setResultsName(TokenField.Type.value) + pp.Group( + _pp_num_sign.setResultsName(TokenField.Sign.value) + + _pp_num_type.setResultsName(TokenField.Number.value) +) + +# ## COMBINED TOKEN PARSER ## +_pp_token = _pp_any_flag ^ _pp_string ^ _pp_number + +# Will (presumably) eventually need to implement preceding/following +# literal strings on the number specifications + + +# ## PARSER CLASS FOR EXTERNAL USE ## + + @attr.s class Parser: """Mini-language parser for structured numerical data.""" - def convert_line(self, line): + @classmethod + def convert_line(cls, line, *, capture_groups=True): """Implement dirt-oversimple line converter.""" import shlex tokens = shlex.split(line) + # Zero-length start of line (or of entire string) match pattern = r"(^|(?<=\n))" - for t in tokens: - if t == "*": + group_id = 0 + + for i, t in enumerate(tokens): + # Optional whitespace before the first token; + # mandatory whitespace before all others + pattern += r"\s*" if i == 0 else r"\s+" + + # Will raise parse error here if bad token + pr = _pp_token.parseString(t) + + if pr[0] == _s_any_flag: pattern += ".*?" 
- # ~ elif t.startswith('"') and t.endswith('"'): - # ~ pattern += t[1:-1] - elif t == "i": - pattern += std_wordify( - number_patterns[(Number.Integer, Sign.Positive)] - ) - elif t.startswith("!"): - for c in t[1:]: - if c in "[\^$.|?*+(){}": - pattern += "\\" + c - else: - pattern += c - else: - raise ValueError("BAD PATTERN, NEED CUSTOM ERRORS!") - # Plus anything to the end of the line + elif pr[0] == StringMatchType.Ignore.value: + pattern += cls._string_pattern(pr[1]) + + elif pr[0] == StringMatchType.Capture.value: + if capture_groups: + group_id += 1 + pattern += cls._group_open(group_id) + + pattern += cls._string_pattern(pr[1]) + + if capture_groups: + pattern += cls._group_close() + + elif pr[0] == NumberMatchType.Suppress.value: + num_pat = cls._get_number_pattern(pr[1]) + pattern += std_wordify(num_pat) + + # Plus possible whitespace to the end of the line # THIS APPROACH *MAY* END UP BEING PROBLEMATIC - pattern += r".*?($|(?=\n))" + pattern += r"[ ]*($|(?=\n))" + + return pattern + + @staticmethod + def _get_number_pattern(parse_result): + """Return the correct number pattern given the parse result.""" + num = Number(parse_result[TokenField.Number.value]) + sign = Sign(parse_result[TokenField.Sign.value]) + + return number_patterns[num, sign] + + @staticmethod + def _group_open(group_id): + """Create the opening pattern for a named group.""" + return r"(?P<{}>".format(str(group_id)) + + @staticmethod + def _group_close(): + """Create the closing pattern for a named group.""" + return ")" + + @staticmethod + def _string_pattern(s): + """Create a literal string pattern from `s`.""" + pattern = "" + + for c in s: + if c in "[\^$.|?*+(){}": + pattern += "\\" + c + else: + pattern += c return pattern diff --git a/pent/patterns.py b/pent/patterns.py index bbef782..69d7b2f 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -38,7 +38,7 @@ #: |str| with the standard numerical punctuation to include as not #: marking word boundaries. 
`de` is included to account for scientific #: notation. -std_num_punct = std_scinot_markers + "+.-" +std_num_punct = std_scinot_markers + "+.-" # MUST have '-' at the end!! def wordify_pattern(p, word_chars): diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 6029496..325cfc9 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -92,14 +92,14 @@ def test_raw_single_value_space_delimited(self): class TestPentParserPatterns(ut.TestCase, SuperPent): """Confirming pattern matching of patterns generated by the Parser.""" - def test_simple_parser(self): - """Confirm the dummy initial parser works as expected.""" + def test_parser_single_line_space_delim(self): + """Confirm parser works on single lines with space-delimited values.""" import pent from .testdata import number_sign_vals as vals test_line = "This line contains the value {} with space delimit." - test_pat = "* !contains * i" + test_pat_template = "~ !contains ~ #{0}{1} ~" prs = pent.Parser() @@ -107,17 +107,17 @@ def test_simple_parser(self): test_str = test_line.format(v) for (n, s) in itt.product(pent.Number, pent.Sign): + test_pat = test_pat_template.format(s.value, n.value) - # Expand this to other Signs eventually!!! - if s is not pent.Sign.Positive or n is not pent.Number.Integer: - continue + # ~ if s is not pent.Sign.Any or n is not pent.Number.Integer: + # ~ continue with self.subTest(self.make_testname(v, n, s)): npat = prs.convert_line(test_pat) res = self.parsetest(npat, test_str) - self.assertEqual(vals[v][(n, s)], res, msg=test_str) + self.assertEqual(vals[v][n, s], res, msg=test_str) def suite_expect_good(): From 8ea57879ea42662ce58aa7c5077411bcc0f16b57 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sun, 9 Sep 2018 17:04:10 -0400 Subject: [PATCH 13/44] ADMIN/TEST: Add .coveragerc; add/refine tests ADMIN: Definitely worthwhile to know, now, how the coverage looks (pretty good!) Add not-executable notices to all package modules. 
TEST: Add test for capturing and non-capturing string variations Add test for "~" pattern matching an arbitrary line. This would probably be a good thing to throw at hypothesis... --- .coveragerc | 13 ++++++++++++ pent/__init__.py | 3 ++- pent/enums.py | 4 ++++ pent/parser.py | 7 ++++++- pent/patterns.py | 2 +- pent/test/pent_base.py | 46 +++++++++++++++++++++++++++++++++--------- 6 files changed, 63 insertions(+), 12 deletions(-) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..9223b56 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,13 @@ +[run] +omit = + # Don't do coverage on test code + pent/test/* + tests.py + + # Don't cover code in the env + env/* + +[report] +exclude_lines = + pragma: no cover + ^\s*pass\s*$ diff --git a/pent/__init__.py b/pent/__init__.py index 425fcbf..dc7e576 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -37,10 +37,11 @@ "number_patterns", "wordify_pattern", "std_wordify", + "group_prefix", ] from .enums import Number, Sign, TokenField, NumberMatchType, StringMatchType -from .parser import Parser +from .parser import Parser, group_prefix from .patterns import number_patterns, wordify_pattern, std_wordify diff --git a/pent/enums.py b/pent/enums.py index 8cba1c3..e9a47e8 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -99,3 +99,7 @@ class TokenField(Enum): #: Format of the numerical value (int, float, scinot, decimal, general) Number = "number" + + +if __name__ == "__main__": # pragma: no cover + print("Module not executable.") diff --git a/pent/parser.py b/pent/parser.py index b152480..24336a4 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -39,6 +39,7 @@ def _concat_values(e): """Concatenate the values of the given Enum.""" return "".join(_.value for _ in e) +group_prefix = 'g' _s_any_flag = "~" @@ -143,7 +144,7 @@ def _get_number_pattern(parse_result): @staticmethod def _group_open(group_id): """Create the opening pattern for a named group.""" - return 
r"(?P<{}>".format(str(group_id)) + return r"(?P".format(str(group_id)) @staticmethod def _group_close(): @@ -162,3 +163,7 @@ def _string_pattern(s): pattern += c return pattern + + +if __name__ == "__main__": # pragma: no cover + print("Module not executable.") diff --git a/pent/patterns.py b/pent/patterns.py index 69d7b2f..67cd50a 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -82,5 +82,5 @@ def std_wordify(p): number_patterns.update({(n, s): _p_signs[s] + _p_nums[n]}) -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover print("Module not executable.") diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 325cfc9..5ad6a05 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -38,9 +38,9 @@ class SuperPent: """Superclass of various test classes, with common methods.""" @staticmethod - def parsetest(npat, s): - """Run an individual parse test on `s` using pattern `npat`.""" - m = re.search(npat, s) + def parsetest(re_pat, s): + """Run an individual parse test on `s` using regex pattern `re_pat`.""" + m = re.search(re_pat, s) return m is not None @@ -92,6 +92,9 @@ def test_raw_single_value_space_delimited(self): class TestPentParserPatterns(ut.TestCase, SuperPent): """Confirming pattern matching of patterns generated by the Parser.""" + import pent + prs = pent.Parser() + def test_parser_single_line_space_delim(self): """Confirm parser works on single lines with space-delimited values.""" import pent @@ -101,24 +104,49 @@ def test_parser_single_line_space_delim(self): test_line = "This line contains the value {} with space delimit." 
test_pat_template = "~ !contains ~ #{0}{1} ~" - prs = pent.Parser() - for v in vals: test_str = test_line.format(v) for (n, s) in itt.product(pent.Number, pent.Sign): test_pat = test_pat_template.format(s.value, n.value) - # ~ if s is not pent.Sign.Any or n is not pent.Number.Integer: - # ~ continue - with self.subTest(self.make_testname(v, n, s)): - npat = prs.convert_line(test_pat) + npat = self.prs.convert_line(test_pat) res = self.parsetest(npat, test_str) self.assertEqual(vals[v][n, s], res, msg=test_str) + def test_string_capture(self): + """Confirm string capture works when desired; is ignored when not.""" + import pent + + test_line = "This is a string with a word in it." + test_pat_capture = "~ =word ~" + test_pat_ignore = "~ !word ~" + + with self.subTest('capture'): + pat = self.prs.convert_line(test_pat_capture) + m = re.search(pat, test_line) + self.assertIsNotNone(m) + self.assertEqual(m.group(pent.group_prefix + '1'), 'word') + + with self.subTest('ignore'): + pat = self.prs.convert_line(test_pat_ignore) + m = re.search(pat, test_line) + self.assertIsNotNone(m) + self.assertRaises(IndexError, m.group, pent.group_prefix + '1') + + def test_match_entire_line(self): + """Confirm the tilde works to match an entire line.""" + import pent + + test_line = "This is a line that shouldn't matter what's in it" + test_pat = "~" + + pat = self.prs.convert_line(test_pat) + self.assertTrue(self.parsetest(pat, test_line)) + def suite_expect_good(): """Create and return the test suite for expect-good tests.""" From 4cdfbbb06bf493e824b9513753abdcf877e8a5db Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sun, 9 Sep 2018 23:59:46 -0400 Subject: [PATCH 14/44] DEV/TEST: Start attempting no-space gap; add tests Various tests for captures, non-captures, etc. added. Started trying to implement the no-space-gap feature. 
The original idea of just not inserting the preceding space pattern bit is not going to work, because it runs afoul of both the preceding and the 'x'-marked tokens being wordified. This may require a complete first pass through all the tokens to identify any that are no-spaced, followed by a second pass to actually assemble the patterns, with wordification applied appropriately to no-space groups. --- pent/enums.py | 6 ++ pent/parser.py | 81 +++++++++++++++++------ pent/patterns.py | 3 +- pent/test/pent_base.py | 147 ++++++++++++++++++++++++++++++++++++----- 4 files changed, 199 insertions(+), 38 deletions(-) diff --git a/pent/enums.py b/pent/enums.py index e9a47e8..8946ead 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -100,6 +100,12 @@ class TokenField(Enum): #: Format of the numerical value (int, float, scinot, decimal, general) Number = "number" + #: Combined sign and number, for initial pattern group retrieval + SignNumber = "sign_number" + + #: Flag to suppress preceding space in the generated pattern + NoSpace = "no_space" + if __name__ == "__main__": # pragma: no cover print("Module not executable.") diff --git a/pent/parser.py b/pent/parser.py index 24336a4..e98b8f2 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -29,7 +29,7 @@ from .enums import Number, Sign, TokenField from .enums import NumberMatchType, StringMatchType -from .patterns import number_patterns, wordify_pattern, std_wordify +from .patterns import number_patterns, std_wordify # ## MINI-LANGUAGE PARSER DEFINITION ## @@ -39,10 +39,13 @@ def _concat_values(e): """Concatenate the values of the given Enum.""" return "".join(_.value for _ in e) -group_prefix = 'g' + +group_prefix = "g" _s_any_flag = "~" +_s_num_no_space = "x" + # ## ARBITRARY CONTENT ## # Tilde says anything may be here, including multiple words _pp_any_flag = pp.Literal(_s_any_flag) @@ -68,9 +71,16 @@ def _concat_values(e): _pp_num_type = pp.Word(_concat_values(Number), exact=1) # Composite pattern for a number -_pp_number 
= _pp_num_flag.setResultsName(TokenField.Type.value) + pp.Group( - _pp_num_sign.setResultsName(TokenField.Sign.value) - + _pp_num_type.setResultsName(TokenField.Number.value) +_pp_number = ( + _pp_num_flag.setResultsName(TokenField.Type.value) + + pp.Group( + _pp_num_sign.setResultsName(TokenField.Sign.value) + + _pp_num_type.setResultsName(TokenField.Number.value) + ).setResultsName(TokenField.SignNumber.value) + + pp.Optional(pp.Literal(_s_num_no_space)).setResultsName( + TokenField.NoSpace.value + ) + + pp.WordEnd() ) # ## COMBINED TOKEN PARSER ## @@ -102,34 +112,46 @@ def convert_line(cls, line, *, capture_groups=True): for i, t in enumerate(tokens): # Optional whitespace before the first token; # mandatory whitespace before all others - pattern += r"\s*" if i == 0 else r"\s+" + pre_space = r"\s*" if i == 0 else r"\s+" # Will raise parse error here if bad token pr = _pp_token.parseString(t) if pr[0] == _s_any_flag: - pattern += ".*?" + pattern += pre_space + ".*?" elif pr[0] == StringMatchType.Ignore.value: - pattern += cls._string_pattern(pr[1]) + pattern += pre_space + cls._string_pattern(pr[1]) elif pr[0] == StringMatchType.Capture.value: - if capture_groups: - group_id += 1 - pattern += cls._group_open(group_id) - - pattern += cls._string_pattern(pr[1]) - - if capture_groups: - pattern += cls._group_close() + subpat, group_id = cls._group_enclose( + cls._string_pattern(pr[1]), + group_id, + do_enclose=capture_groups, + ) + pattern += pre_space + subpat elif pr[0] == NumberMatchType.Suppress.value: - num_pat = cls._get_number_pattern(pr[1]) - pattern += std_wordify(num_pat) + # THE 'NO-SPACE BEFORE' FEATURE IS GOING TO BE COMPLEX, SINCE + # IT WON'T WORK TO WORDIFY THE PATTERNS FROM EACH TOKEN + # BECAUSE THERE'S NO ACTUAL WORD BREAK WHEN THERE'S NO + # PRECEDING SPACE + pattern += "" if TokenField.NoSpace.value in pr else pre_space + pattern += std_wordify( + cls._get_number_pattern(pr[TokenField.SignNumber.value]) + ) + + elif pr[0] == 
NumberMatchType.Single.value: + subpat, group_id = cls._group_enclose( + cls._get_number_pattern(pr[1]), + group_id, + do_enclose=capture_groups, + ) + pattern += pre_space + std_wordify(subpat) # Plus possible whitespace to the end of the line # THIS APPROACH *MAY* END UP BEING PROBLEMATIC - pattern += r"[ ]*($|(?=\n))" + pattern += r"[ \t]*($|(?=\n))" return pattern @@ -141,10 +163,29 @@ def _get_number_pattern(parse_result): return number_patterns[num, sign] + @classmethod + def _group_enclose(cls, pat, group_id, *, do_enclose=True): + """Enclose the pattern in the group, if told to do so. + + Returns the pattern, modified or not, and the updated group_id. + + """ + outpat = "" + if do_enclose: + group_id += 1 + outpat += cls._group_open(group_id) + + outpat += pat + + if do_enclose: + outpat += cls._group_close() + + return outpat, group_id + @staticmethod def _group_open(group_id): """Create the opening pattern for a named group.""" - return r"(?P".format(str(group_id)) + return r"(?P<{0}{1}>".format(group_prefix, str(group_id)) @staticmethod def _group_close(): diff --git a/pent/patterns.py b/pent/patterns.py index 67cd50a..8c2b305 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -25,7 +25,6 @@ """ import itertools as itt -import re from .enums import Number, Sign @@ -42,7 +41,7 @@ def wordify_pattern(p, word_chars): - """Wrap a pattern with word start/end markers using arbitrary word chars.""" + """Wrap pattern with word start/end markers using arbitrary word chars.""" return r"(? Date: Mon, 10 Sep 2018 23:19:22 -0400 Subject: [PATCH 15/44] REFACTOR: Implement Token parsing class The mini-language token parsing is now encapsulated in Token, in parser.py. This should make for a much cleaner scan over the mini-language content within Parser, and should allow for much more flexible adaptation/augmentation/modification of the regex patterns generated out of each token/Token. Add PentError and BadTokenError custom exceptions. 
Add Enum for 'any' match types. Will be useful if/when a 'capturing-any' variant is implemented, but was mainly added to allow use of the 'parser._has_value' helper function. --- pent/__init__.py | 10 +- pent/enums.py | 7 ++ pent/errors.py | 42 ++++++++ pent/parser.py | 219 +++++++++++++++++++++++++++-------------- pent/test/pent_base.py | 3 +- 5 files changed, 204 insertions(+), 77 deletions(-) create mode 100644 pent/errors.py diff --git a/pent/__init__.py b/pent/__init__.py index dc7e576..7b8ce4f 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -29,19 +29,25 @@ __all__ = [ "Parser", + "Token", "Number", "Sign", "TokenField", + "AnyMatchType", "NumberMatchType", "StringMatchType", "number_patterns", "wordify_pattern", "std_wordify", "group_prefix", + "PentError", + "BadTokenError", ] -from .enums import Number, Sign, TokenField, NumberMatchType, StringMatchType -from .parser import Parser, group_prefix +from .enums import Number, Sign, TokenField +from .enums import AnyMatchType, NumberMatchType, StringMatchType +from .errors import PentError, BadTokenError +from .parser import Parser, Token, group_prefix from .patterns import number_patterns, wordify_pattern, std_wordify diff --git a/pent/enums.py b/pent/enums.py index 8946ead..adb87ac 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -59,6 +59,13 @@ class Sign(Enum): Any = "." +class AnyMatchType(Enum): + """Enumeration for various 'any' match types.""" + + #: Non-captured match + Ignore = "~" + + class StringMatchType(Enum): """Enumeration for the various match types on literal string fields.""" diff --git a/pent/errors.py b/pent/errors.py new file mode 100644 index 0000000..bbb309e --- /dev/null +++ b/pent/errors.py @@ -0,0 +1,42 @@ +r"""*Custom exceptions for* ``pent``. + +``pent`` Extracts Numerical Text. 
+ +**Author** + Brian Skinn (bskinn@alum.mit.edu) + +**File Created** + 10 Sep 2018 + +**Copyright** + \(c) Brian Skinn 2018 + +**Source Repository** + http://www.github.com/bskinn/pent + +**Documentation** + http://pent.readthedocs.io + +**License** + The MIT License; see |license_txt|_ for full license terms + +**Members** + +""" + +class PentError(Exception): + pass + + +class BadTokenError(PentError): + """Raised during attempts to parse an invalid token.""" + + def __init__(self, token): + self.token = token + + def __str__(self): + return "'{}' is an invalid pent token".format(self.token) + + +if __name__ == "__main__": # pragma: no cover + print("Module not executable.") diff --git a/pent/parser.py b/pent/parser.py index e98b8f2..7ef1c17 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -28,7 +28,8 @@ import pyparsing as pp from .enums import Number, Sign, TokenField -from .enums import NumberMatchType, StringMatchType +from .enums import NumberMatchType, StringMatchType, AnyMatchType +from .errors import BadTokenError from .patterns import number_patterns, std_wordify @@ -40,6 +41,11 @@ def _concat_values(e): return "".join(_.value for _ in e) +def _has_value(e, v): + """Check if Enum 'e' has value 'v'.""" + return v in (_.value for _ in e) + + group_prefix = "g" _s_any_flag = "~" @@ -102,7 +108,9 @@ def convert_line(cls, line, *, capture_groups=True): """Implement dirt-oversimple line converter.""" import shlex + # Parse line into tokens, and then into Tokens tokens = shlex.split(line) + tokens = list(Token(_, capture=capture_groups) for _ in tokens) # Zero-length start of line (or of entire string) match pattern = r"(^|(?<=\n))" @@ -110,44 +118,15 @@ def convert_line(cls, line, *, capture_groups=True): group_id = 0 for i, t in enumerate(tokens): - # Optional whitespace before the first token; - # mandatory whitespace before all others - pre_space = r"\s*" if i == 0 else r"\s+" - - # Will raise parse error here if bad token - pr = 
_pp_token.parseString(t) - - if pr[0] == _s_any_flag: - pattern += pre_space + ".*?" - - elif pr[0] == StringMatchType.Ignore.value: - pattern += pre_space + cls._string_pattern(pr[1]) - - elif pr[0] == StringMatchType.Capture.value: - subpat, group_id = cls._group_enclose( - cls._string_pattern(pr[1]), - group_id, - do_enclose=capture_groups, - ) - pattern += pre_space + subpat - - elif pr[0] == NumberMatchType.Suppress.value: - # THE 'NO-SPACE BEFORE' FEATURE IS GOING TO BE COMPLEX, SINCE - # IT WON'T WORK TO WORDIFY THE PATTERNS FROM EACH TOKEN - # BECAUSE THERE'S NO ACTUAL WORD BREAK WHEN THERE'S NO - # PRECEDING SPACE - pattern += "" if TokenField.NoSpace.value in pr else pre_space - pattern += std_wordify( - cls._get_number_pattern(pr[TokenField.SignNumber.value]) - ) - - elif pr[0] == NumberMatchType.Single.value: - subpat, group_id = cls._group_enclose( - cls._get_number_pattern(pr[1]), - group_id, - do_enclose=capture_groups, - ) - pattern += pre_space + std_wordify(subpat) + # IGNORING SPACE_AFTER FOR NOW + pattern += r"\s*" if i == 0 else r"\s+" + + tok_pattern = t.pattern + if t.needs_group_id: + group_id += 1 + tok_pattern = tok_pattern.format(str(group_id)) + + pattern += tok_pattern # Plus possible whitespace to the end of the line # THIS APPROACH *MAY* END UP BEING PROBLEMATIC @@ -155,42 +134,105 @@ def convert_line(cls, line, *, capture_groups=True): return pattern - @staticmethod - def _get_number_pattern(parse_result): - """Return the correct number pattern given the parse result.""" - num = Number(parse_result[TokenField.Number.value]) - sign = Sign(parse_result[TokenField.Sign.value]) - - return number_patterns[num, sign] - - @classmethod - def _group_enclose(cls, pat, group_id, *, do_enclose=True): - """Enclose the pattern in the group, if told to do so. - - Returns the pattern, modified or not, and the updated group_id. 
- """ - outpat = "" - if do_enclose: - group_id += 1 - outpat += cls._group_open(group_id) - - outpat += pat - - if do_enclose: - outpat += cls._group_close() - - return outpat, group_id - - @staticmethod - def _group_open(group_id): - """Create the opening pattern for a named group.""" - return r"(?P<{0}{1}>".format(group_prefix, str(group_id)) - - @staticmethod - def _group_close(): - """Create the closing pattern for a named group.""" - return ")" +@attr.s +class Token: + """Encapsulates transforming mini-language patterns tokens into regex.""" + + from .patterns import number_patterns as _numpats + + #: Mini-language token string to be parsed + token = attr.ib() + + #: Whether group captures should be added or not + capture = attr.ib(default=True) + + #: Flag for whether group ID substitution needs to be done + needs_group_id = attr.ib(default=False, init=False, repr=False) + + #: Compiled regex pattern from the token + @property + def pattern(self): + return self._pattern + + #: Flag for whether the token is an "any content" token + @property + def is_any(self): + return _has_value(AnyMatchType, self._pr[0]) + + #: Flag for whether the token matches a literal string + @property + def is_str(self): + return _has_value(StringMatchType, self._pr[0]) + + #: Flag for whether the token matches a number + @property + def is_num(self): + return _has_value(NumberMatchType, self._pr[0]) + + #: String matching type; |None| if token doesn't match a string + @property + def str_match_type(self): + if self.is_str: + return StringMatchType(self._pr[0]) + else: + return None + + #: Number matching type; |None| if token doesn't match a number + @property + def num_match_type(self): + if self.is_num: + return NumberMatchType(self._pr[0]) + else: + return None + + #: Number format matched; |None| if token doesn't match a number + @property + def number(self): + if self.is_num: + return Number(self._pr[TokenField.SignNumber.value][TokenField.Number.value]) + else: + return None + 
+ #: Number sign matched; |None| if token doesn't match a number + @property + def sign(self): + if self.is_num: + return Sign(self._pr[TokenField.SignNumber.value][TokenField.Sign.value]) + else: + return None + + #: Flag for whether space should be provided for after the match + @property + def space_after(self): + if self.is_num: + return not TokenField.NoSpace.value in self._pr + else: + return None + + def __attrs_post_init__(self): + """Handle automatic creation stuff.""" + try: + self._pr = _pp_token.parseString(self.token) + except pp.ParseException as e: + raise BadTokenError(self.token) from e + + if self.is_any: + self._pattern = '.*?' + + elif self.is_str: + self._pattern = self._string_pattern(self._pr[1]) + + if self.capture and self._pr[0] == StringMatchType.Capture.value: + self.needs_group_id = True + self._pattern = self._group_enclose(self._pattern) + + elif self.is_num: + self._pattern = self._get_number_pattern(self._pr) + + if self.capture and self._pr[0] == NumberMatchType.Single.value: + self.needs_group_id = True + self._pattern = self._group_enclose(self._pattern) @staticmethod def _string_pattern(s): @@ -199,12 +241,41 @@ def _string_pattern(s): for c in s: if c in "[\^$.|?*+(){}": + # Must escape regex special characters pattern += "\\" + c else: pattern += c return pattern + @classmethod + def _get_number_pattern(cls, parse_result): + """Return the correct number pattern given the parse result.""" + num = Number(parse_result[TokenField.SignNumber.value][TokenField.Number.value]) + sign = Sign(parse_result[TokenField.SignNumber.value][TokenField.Sign.value]) + + return cls._numpats[num, sign] + + @staticmethod + def _group_open(): + """Create the opening pattern for a named group. + + This leaves a formatting placeholder for the invoking Parser + to inject the appropriate group ID. 
+ + """ + return r"(?P<{0}{{0}}>".format(group_prefix) + + @staticmethod + def _group_close(): + """Create the closing pattern for a named group.""" + return ")" + + @classmethod + def _group_enclose(cls, pat): + """Enclose the pattern in the group enclosure.""" + return cls._group_open() + pat + cls._group_close() + if __name__ == "__main__": # pragma: no cover print("Module not executable.") diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 1a71646..82765e1 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -174,6 +174,7 @@ def test_single_num_capture(self): if m is not None: self.assertEqual(m.group(pent.group_prefix + "1"), v) + @ut.skip("Awaiting refactor and implement") def test_single_nums_no_space(self): """Confirm two-number capture works, with no intervening space. @@ -186,7 +187,7 @@ def test_single_nums_no_space(self): from .testdata import number_sign_vals as vals test_str = "This is a string with 123-456 in it." - test_pat = "~ ..i #-ix ~" + test_pat = "~ ..ix #-i ~" npat = self.prs.convert_line(test_pat) From 63898ed4a54177e2f6e6a245566e6f602dfd9a78 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 10 Sep 2018 23:29:41 -0400 Subject: [PATCH 16/44] ADMIN: Enable Travis --- .travis.yml | 16 ++++++++++++++++ requirements-travis.txt | 4 ++++ 2 files changed, 20 insertions(+) create mode 100644 .travis.yml create mode 100644 requirements-travis.txt diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..f754843 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,16 @@ +install: + - pip install -r requirements-travis.txt +# - pip install -e . +# - sh -c 'cd doc; make html; mkdir scratch' +language: python +python: + - 3.4 + - 3.5 + - 3.6 + - 3.7-dev +script: + - coverage run tests.py -a +# - flake8 sphobjinv +# - echo $TRAVIS_PYTHON_VERSION | grep -e '^3\.6' && sh -c 'cd doc; make doctest' || echo 'No doctest.' + - echo $TRAVIS_PYTHON_VERSION | grep -e '^3\.6' && codecov || echo "No codecov." 
+ diff --git a/requirements-travis.txt b/requirements-travis.txt new file mode 100644 index 0000000..275ad96 --- /dev/null +++ b/requirements-travis.txt @@ -0,0 +1,4 @@ +attrs<18 +pyparsing +coverage +codecov From dd00d523c56b1784346c637575f2f0d6070993b0 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Tue, 11 Sep 2018 00:07:56 -0400 Subject: [PATCH 17/44] ADMIN: Update version, setup.py; setup tox Identified earliest acceptable version of pyparsing, 1.5.5. Just went with py34 and attrs 17.1 as earliest there. Not bothering with py33. Add setup.py to blackened files. Canonicalize the '.dev1' version suffix, per setup.py check --- .travis.yml | 2 +- black | 2 +- black.bat | 2 +- pent/__init__.py | 2 +- pent/errors.py | 5 +++-- pent/parser.py | 18 +++++++++++++----- setup.py | 17 ++++++++++++----- tox.ini | 44 ++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 76 insertions(+), 16 deletions(-) create mode 100644 tox.ini diff --git a/.travis.yml b/.travis.yml index f754843..63ce14c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ python: - 3.7-dev script: - coverage run tests.py -a -# - flake8 sphobjinv +# - flake8 pent # - echo $TRAVIS_PYTHON_VERSION | grep -e '^3\.6' && sh -c 'cd doc; make doctest' || echo 'No doctest.' - echo $TRAVIS_PYTHON_VERSION | grep -e '^3\.6' && codecov || echo "No codecov." diff --git a/black b/black index c38582f..c853019 100755 --- a/black +++ b/black @@ -1,4 +1,4 @@ #! 
/bin/bash -black tests.py pent pent/test -l 79 +black setup.py tests.py pent pent/test -l 79 diff --git a/black.bat b/black.bat index 637bc4a..3ec2e65 100644 --- a/black.bat +++ b/black.bat @@ -1,4 +1,4 @@ @echo off -black.exe tests.py pent pent\test -l 79 +black.exe setup.py tests.py pent pent\test -l 79 diff --git a/pent/__init__.py b/pent/__init__.py index 7b8ce4f..b9e5139 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -51,4 +51,4 @@ from .patterns import number_patterns, wordify_pattern, std_wordify -__version__ = "0.1dev1" +__version__ = "0.1.dev1" diff --git a/pent/errors.py b/pent/errors.py index bbb309e..c0c6367 100644 --- a/pent/errors.py +++ b/pent/errors.py @@ -24,11 +24,12 @@ """ -class PentError(Exception): + +class PentError(Exception): # pragma: no cover pass -class BadTokenError(PentError): +class BadTokenError(PentError): # pragma: no cover """Raised during attempts to parse an invalid token.""" def __init__(self, token): diff --git a/pent/parser.py b/pent/parser.py index 7ef1c17..3ac3258 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -190,7 +190,9 @@ def num_match_type(self): @property def number(self): if self.is_num: - return Number(self._pr[TokenField.SignNumber.value][TokenField.Number.value]) + return Number( + self._pr[TokenField.SignNumber.value][TokenField.Number.value] + ) else: return None @@ -198,7 +200,9 @@ def number(self): @property def sign(self): if self.is_num: - return Sign(self._pr[TokenField.SignNumber.value][TokenField.Sign.value]) + return Sign( + self._pr[TokenField.SignNumber.value][TokenField.Sign.value] + ) else: return None @@ -218,7 +222,7 @@ def __attrs_post_init__(self): raise BadTokenError(self.token) from e if self.is_any: - self._pattern = '.*?' + self._pattern = ".*?" 
elif self.is_str: self._pattern = self._string_pattern(self._pr[1]) @@ -251,8 +255,12 @@ def _string_pattern(s): @classmethod def _get_number_pattern(cls, parse_result): """Return the correct number pattern given the parse result.""" - num = Number(parse_result[TokenField.SignNumber.value][TokenField.Number.value]) - sign = Sign(parse_result[TokenField.SignNumber.value][TokenField.Sign.value]) + num = Number( + parse_result[TokenField.SignNumber.value][TokenField.Number.value] + ) + sign = Sign( + parse_result[TokenField.SignNumber.value][TokenField.Sign.value] + ) return cls._numpats[num, sign] diff --git a/setup.py b/setup.py index 0f04050..e438e49 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,7 @@ from setuptools import setup +from pent import __version__ + def readme(): with open("README.rst", "r") as f: @@ -8,14 +10,18 @@ def readme(): setup( name="pent", - version="0.0", - packages=["pent"], + version=__version__, + description="Pent Extracts Numerical Text", + long_description=readme(), url="https://www.github.com/bskinn/pent", license="MIT License", author="Brian Skinn", author_email="bskinn@alum.mit.edu", - description="Pent Extracts Numerical Text", - long_description=readme(), + packages=["pent"], + provides=["pent"], + python_requires=">=3.4", + requires=["attrs (>=17.1)", "pyparsing (>=1.5.5)"], + install_requires=["attrs>=17.1", "pyparsing>=1.5.5"], classifiers=[ "License :: OSI Approved :: MIT License", "Natural Language :: English", @@ -25,9 +31,10 @@ def readme(): "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Utilities", - "Development Status :: 1 - Planning", + "Development Status :: 2 - Pre-Alpha", ], ) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..2b48f8b --- /dev/null +++ b/tox.ini @@ -0,0 +1,44 @@ +[tox] 
+minversion=2.0 +envlist= + py36-attrs_17_4-pp_{1_5_5,2_0_0,2_2_0,latest} + py36-attrs_{17_1,17_4,latest}-pp_2_2_0 + py3{4,5,6,7}-attrs_17_4-pp_2_2_0 + py3{4,6}-attrs_17_1-pp_1_5_5 + +[testenv] +whitelist_externals=/bin/sh +commands= + python --version +# sh -c 'cd doc; make html' + python tests.py -a +# py3{5,6,7}: sh -c 'cd doc; make doctest' + +deps= + attrs_17_1: attrs==17.1 + attrs_17_3: attrs==17.3 + attrs_17_4: attrs==17.4 + attrs_latest: attrs + pp_2_2_0: pyparsing==2.2.0 + pp_2_0_0: pyparsing==2.0.0 + pp_1_5_5: pyparsing==1.5.5 + pp_latest: pyparsing + sphinx + sphinx-issues + sphinx-rtd-theme + +[testenv:win] +platform=win +basepython= + py36: C:\python36\python.exe + py35: C:\python35\python.exe + py34: C:\python34\python.exe + +[testenv:linux] +platform=linux +basepython= + py37: python3.7 + py36: python3.6 + py35: python3.5 + py34: python3.4 + From 7d54d4eeab4fb2cfffb9349fb17ff1b4752591ac Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Tue, 11 Sep 2018 15:56:22 -0400 Subject: [PATCH 18/44] DEV: Implement no-space-after numtoken Quite the hinky-jinks to get the word boundaries in the right place when no-space-after numerical tokens are present. Basic functionality is in place, though, so closes #5. Still needs expanded testing, though. Probably would be worth bringing in hypothesis.... 
--- pent/parser.py | 41 +++++++++++++++++++++++++++++++++-------- pent/patterns.py | 28 ++++++++++++++++++++++++++-- pent/test/pent_base.py | 3 +-- 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/pent/parser.py b/pent/parser.py index 3ac3258..d9004f2 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -30,7 +30,7 @@ from .enums import Number, Sign, TokenField from .enums import NumberMatchType, StringMatchType, AnyMatchType from .errors import BadTokenError -from .patterns import number_patterns, std_wordify +from .patterns import number_patterns, std_wordify_open, std_wordify_close # ## MINI-LANGUAGE PARSER DEFINITION ## @@ -115,21 +115,44 @@ def convert_line(cls, line, *, capture_groups=True): # Zero-length start of line (or of entire string) match pattern = r"(^|(?<=\n))" + # Always have optional starting whitespace + pattern += r"[ \t]*" + + # Must initialize group_id = 0 - for i, t in enumerate(tokens): - # IGNORING SPACE_AFTER FOR NOW - pattern += r"\s*" if i == 0 else r"\s+" + # Initialize flag for a preceding no-space-after num token + prior_no_space_token = False + for i, t in enumerate(tokens): tok_pattern = t.pattern if t.needs_group_id: group_id += 1 tok_pattern = tok_pattern.format(str(group_id)) - pattern += tok_pattern + if t.is_num: + if not prior_no_space_token: + tok_pattern = std_wordify_open(tok_pattern) + + if t.space_after: + tok_pattern = std_wordify_close(tok_pattern) + prior_no_space_token = False + else: + prior_no_space_token = True + + pattern += tok_pattern - # Plus possible whitespace to the end of the line - # THIS APPROACH *MAY* END UP BEING PROBLEMATIC + else: + pattern += tok_pattern + prior_no_space_token = False + + # Add required space or no space, depending on + # what the token calls for, as long as it's not + # the last token + if i < len(tokens) - 1 and t.space_after: + pattern += r"[ \t]+" + + # Always put possible whitespace to the end of the line pattern += r"[ \t]*($|(?=\n))" return pattern @@ -211,8 
+234,10 @@ def sign(self): def space_after(self): if self.is_num: return not TokenField.NoSpace.value in self._pr + elif self.is_str: + return True else: - return None + return False def __attrs_post_init__(self): """Handle automatic creation stuff.""" diff --git a/pent/patterns.py b/pent/patterns.py index 8c2b305..03f774c 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -40,14 +40,38 @@ std_num_punct = std_scinot_markers + "+.-" # MUST have '-' at the end!! +#: Standard word marker characters for pent +std_word_chars = "a-zA-Z0-9" + std_num_punct + + +def wordify_open(p, word_chars): + """Prepend the word start markers.""" + return r"(? Date: Tue, 11 Sep 2018 17:48:48 -0400 Subject: [PATCH 19/44] REFACTOR: Reimplement Enums as str hybrids. BRILLIANT. So many things that needed to be done with these values are DRAMATICALLY easier with the string mix-in. Closes #13, and HOW. --- pent/enums.py | 12 +++++----- pent/parser.py | 61 +++++++++++++++++--------------------------------- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/pent/enums.py b/pent/enums.py index adb87ac..184b9ba 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -27,7 +27,7 @@ from enum import Enum -class Number(Enum): +class Number(str, Enum): """Enumeration for the different kinds of recognized number primitives.""" #: Integer value; no decimal or scientific/exponential notation @@ -46,7 +46,7 @@ class Number(Enum): General = "g" -class Sign(Enum): +class Sign(str, Enum): """Enumeration for the different kinds of recognized numerical signs.""" #: Positive value only (leading '+' optional; includes zero) @@ -59,14 +59,14 @@ class Sign(Enum): Any = "." 
-class AnyMatchType(Enum): +class AnyMatchType(str, Enum): """Enumeration for various 'any' match types.""" #: Non-captured match Ignore = "~" -class StringMatchType(Enum): +class StringMatchType(str, Enum): """Enumeration for the various match types on literal string fields.""" #: Captured match @@ -76,7 +76,7 @@ class StringMatchType(Enum): Ignore = "!" -class NumberMatchType(Enum): +class NumberMatchType(str, Enum): """Enumeration for the various match types on the numeric fields.""" #: Single value match @@ -95,7 +95,7 @@ class NumberMatchType(Enum): Suppress = "#" -class TokenField(Enum): +class TokenField(str, Enum): """Enumeration for fields within a mini-language number token.""" #: Type of number field (single value, one-or-more, zero-or-more, etc.) diff --git a/pent/parser.py b/pent/parser.py index d9004f2..a6aeb92 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -30,35 +30,24 @@ from .enums import Number, Sign, TokenField from .enums import NumberMatchType, StringMatchType, AnyMatchType from .errors import BadTokenError -from .patterns import number_patterns, std_wordify_open, std_wordify_close +from .patterns import std_wordify_open, std_wordify_close # ## MINI-LANGUAGE PARSER DEFINITION ## # ## HELPERS ## -def _concat_values(e): - """Concatenate the values of the given Enum.""" - return "".join(_.value for _ in e) - - -def _has_value(e, v): - """Check if Enum 'e' has value 'v'.""" - return v in (_.value for _ in e) - - group_prefix = "g" - _s_any_flag = "~" - _s_num_no_space = "x" + # ## ARBITRARY CONTENT ## # Tilde says anything may be here, including multiple words _pp_any_flag = pp.Literal(_s_any_flag) # ## LITERAL STRING ## # Marker for the rest of the token to be a literal string -_pp_str_flag = pp.Word(_concat_values(StringMatchType), exact=1) +_pp_str_flag = pp.Word("".join(StringMatchType), exact=1) # Remainder of the content after the marker, spaces included _pp_str_content = pp.Word(pp.printables + " ") @@ -68,23 +57,23 @@ def 
_has_value(e, v): # ## NUMERICAL VALUE ## # Initial marker for a numerical value -_pp_num_flag = pp.Word(_concat_values(NumberMatchType), exact=1) +_pp_num_flag = pp.Word("".join(NumberMatchType), exact=1) # Marker for the sign of the value; period indicates either sign -_pp_num_sign = pp.Word(_concat_values(Sign), exact=1) +_pp_num_sign = pp.Word("".join(Sign), exact=1) # Marker for the number type to look for -_pp_num_type = pp.Word(_concat_values(Number), exact=1) +_pp_num_type = pp.Word("".join(Number), exact=1) # Composite pattern for a number _pp_number = ( - _pp_num_flag.setResultsName(TokenField.Type.value) + _pp_num_flag.setResultsName(TokenField.Type) + pp.Group( - _pp_num_sign.setResultsName(TokenField.Sign.value) - + _pp_num_type.setResultsName(TokenField.Number.value) - ).setResultsName(TokenField.SignNumber.value) + _pp_num_sign.setResultsName(TokenField.Sign) + + _pp_num_type.setResultsName(TokenField.Number) + ).setResultsName(TokenField.SignNumber) + pp.Optional(pp.Literal(_s_num_no_space)).setResultsName( - TokenField.NoSpace.value + TokenField.NoSpace ) + pp.WordEnd() ) @@ -181,17 +170,17 @@ def pattern(self): #: Flag for whether the token is an "any content" token @property def is_any(self): - return _has_value(AnyMatchType, self._pr[0]) + return self._pr[0] in list(AnyMatchType) #: Flag for whether the token matches a literal string @property def is_str(self): - return _has_value(StringMatchType, self._pr[0]) + return self._pr[0] in list(StringMatchType) #: Flag for whether the token matches a number @property def is_num(self): - return _has_value(NumberMatchType, self._pr[0]) + return self._pr[0] in list(NumberMatchType) #: String matching type; |None| if token doesn't match a string @property @@ -213,9 +202,7 @@ def num_match_type(self): @property def number(self): if self.is_num: - return Number( - self._pr[TokenField.SignNumber.value][TokenField.Number.value] - ) + return Number(self._pr[TokenField.SignNumber][TokenField.Number]) else: 
return None @@ -223,9 +210,7 @@ def number(self): @property def sign(self): if self.is_num: - return Sign( - self._pr[TokenField.SignNumber.value][TokenField.Sign.value] - ) + return Sign(self._pr[TokenField.SignNumber][TokenField.Sign]) else: return None @@ -233,7 +218,7 @@ def sign(self): @property def space_after(self): if self.is_num: - return not TokenField.NoSpace.value in self._pr + return not TokenField.NoSpace in self._pr elif self.is_str: return True else: @@ -252,14 +237,14 @@ def __attrs_post_init__(self): elif self.is_str: self._pattern = self._string_pattern(self._pr[1]) - if self.capture and self._pr[0] == StringMatchType.Capture.value: + if self.capture and self._pr[0] == StringMatchType.Capture: self.needs_group_id = True self._pattern = self._group_enclose(self._pattern) elif self.is_num: self._pattern = self._get_number_pattern(self._pr) - if self.capture and self._pr[0] == NumberMatchType.Single.value: + if self.capture and self._pr[0] == NumberMatchType.Single: self.needs_group_id = True self._pattern = self._group_enclose(self._pattern) @@ -280,12 +265,8 @@ def _string_pattern(s): @classmethod def _get_number_pattern(cls, parse_result): """Return the correct number pattern given the parse result.""" - num = Number( - parse_result[TokenField.SignNumber.value][TokenField.Number.value] - ) - sign = Sign( - parse_result[TokenField.SignNumber.value][TokenField.Sign.value] - ) + num = Number(parse_result[TokenField.SignNumber][TokenField.Number]) + sign = Sign(parse_result[TokenField.SignNumber][TokenField.Sign]) return cls._numpats[num, sign] From 79e8fa224fdb75c18e6fe15a038947a4472b22f4 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Wed, 12 Sep 2018 23:14:54 -0400 Subject: [PATCH 20/44] TEST: Add invalid num to testdata NOTHING matches it... as it should be! 
--- pent/test/testdata.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pent/test/testdata.py b/pent/test/testdata.py index 199530e..710be41 100644 --- a/pent/test/testdata.py +++ b/pent/test/testdata.py @@ -438,6 +438,23 @@ (Number.General, Sign.Positive): False, (Number.General, Sign.Negative): True, (Number.General, Sign.Any): True, - } - # INVALID VALUES... '+-0.349', complex(?), etc. + }, + "+-0.39": { + (Number.Integer, Sign.Positive): False, + (Number.Integer, Sign.Negative): False, + (Number.Integer, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, + (Number.General, Sign.Positive): False, + (Number.General, Sign.Negative): False, + (Number.General, Sign.Any): False, + }, + # INVALID VALUES... complex(?), etc. } From 00eac87fb4c1fae836caf9bbb1bcc1bbb87910ef Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Thu, 13 Sep 2018 01:27:29 -0400 Subject: [PATCH 21/44] DEV: Complete rework of token system But now it's a LOT more general, and should hopefully avoid all of the ugliness of the <..< >..> preceding/following literals hanging on the number types. The tokens are still bulkier than I'd like, but they're not TOO bad. Probably quite readable once one gets used to the syntax? Closes #14. Also closes #9, with the expansion of the 'any' type to both capturing and non-capturing variants. Also closes #6 by obviating it, since this is now best handled by no-space flags and string literals. 
--- pent/__init__.py | 9 ++- pent/enums.py | 63 ++++++++++------ pent/parser.py | 159 +++++++++++++++++++++++++---------------- pent/test/pent_base.py | 24 ++++--- 4 files changed, 154 insertions(+), 101 deletions(-) diff --git a/pent/__init__.py b/pent/__init__.py index b9e5139..fba5cfd 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -33,9 +33,8 @@ "Number", "Sign", "TokenField", - "AnyMatchType", - "NumberMatchType", - "StringMatchType", + "Content", + "Quantity", "number_patterns", "wordify_pattern", "std_wordify", @@ -45,10 +44,10 @@ ] from .enums import Number, Sign, TokenField -from .enums import AnyMatchType, NumberMatchType, StringMatchType +from .enums import Content, Quantity from .errors import PentError, BadTokenError from .parser import Parser, Token, group_prefix from .patterns import number_patterns, wordify_pattern, std_wordify -__version__ = "0.1.dev1" +__version__ = "0.1.dev2" diff --git a/pent/enums.py b/pent/enums.py index 184b9ba..ea29b65 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -59,25 +59,38 @@ class Sign(str, Enum): Any = "." -class AnyMatchType(str, Enum): - """Enumeration for various 'any' match types.""" +class Content(str, Enum): + """Enumeration for the possible types of content.""" - #: Non-captured match - Ignore = "~" + #: Arbitrary match + Any = "~" + #: Literal string + String = "@" -class StringMatchType(str, Enum): - """Enumeration for the various match types on literal string fields.""" + #: Number + Number = "#" - #: Captured match - Capture = "=" - #: Ignored match - Ignore = "!" +# class AnyMatchType(str, Enum): +# """Enumeration for various 'any' match types.""" +# +# #: Non-captured match +# Ignore = "~" -class NumberMatchType(str, Enum): - """Enumeration for the various match types on the numeric fields.""" +# class Capture(str, Enum): +# """Enumeration for whether to store the matched content.""" +# +# #: Captured match +# Capture = "=" +# +# #: Ignored match +# Ignore = "!" 
+ + +class Quantity(str, Enum): + """Enumeration for the various match quantities.""" #: Single value match Single = "." @@ -91,27 +104,33 @@ class NumberMatchType(str, Enum): #: Zero-or-more match ZeroOrMore = "*" - #: Suppressed match - Suppress = "#" - class TokenField(str, Enum): """Enumeration for fields within a mini-language number token.""" - #: Type of number field (single value, one-or-more, zero-or-more, etc.) + #: Content type (any, string, number) Type = "type" - #: Sign of acceptable values (any, positive, negative) - Sign = "sign" + #: Flag to suppress preceding space in the generated pattern + NoSpace = "no_space" - #: Format of the numerical value (int, float, scinot, decimal, general) - Number = "number" + #: Flag to ignore matched content when collecting into regex groups + Ignore = "ignore" + + #: Match quantity of the field (single value, one-or-more, zero-or-more, etc.) + Quantity = "quantity" + + #: Literal content, for a string match + Str = "str" #: Combined sign and number, for initial pattern group retrieval SignNumber = "sign_number" - #: Flag to suppress preceding space in the generated pattern - NoSpace = "no_space" + #: Format of the numerical value (int, float, scinot, decimal, general) + Number = "number" + + #: Sign of acceptable values (any, positive, negative) + Sign = "sign" if __name__ == "__main__": # pragma: no cover diff --git a/pent/parser.py b/pent/parser.py index a6aeb92..bb00282 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -28,7 +28,7 @@ import pyparsing as pp from .enums import Number, Sign, TokenField -from .enums import NumberMatchType, StringMatchType, AnyMatchType +from .enums import Content, Quantity from .errors import BadTokenError from .patterns import std_wordify_open, std_wordify_close @@ -38,48 +38,70 @@ # ## HELPERS ## group_prefix = "g" _s_any_flag = "~" -_s_num_no_space = "x" +_s_ignore = "!" 
+_s_no_space = "x" + +_pp_no_space = pp.Optional(pp.Literal(_s_no_space)).setResultsName( + TokenField.NoSpace +) +_pp_ignore = pp.Optional(pp.Literal(_s_ignore)).setResultsName( + TokenField.Ignore +) +_pp_quantity = pp.Word("".join(Quantity), exact=1).setResultsName( + TokenField.Quantity +) # ## ARBITRARY CONTENT ## # Tilde says anything may be here, including multiple words -_pp_any_flag = pp.Literal(_s_any_flag) +# Definitely want to give the option not to capture. Might ideally +# be the default NOT to capture here... +_pp_any_flag = ( + pp.Literal(_s_any_flag).setResultsName(TokenField.Type) + _pp_ignore +) # ## LITERAL STRING ## # Marker for the rest of the token to be a literal string -_pp_str_flag = pp.Word("".join(StringMatchType), exact=1) +_pp_str_flag = pp.Literal(Content.String.value).setResultsName(TokenField.Type) # Remainder of the content after the marker, spaces included -_pp_str_content = pp.Word(pp.printables + " ") +_pp_str_value = pp.Word(pp.printables + " ").setResultsName(TokenField.Str) # Composite pattern for a literal string -_pp_string = _pp_str_flag + _pp_str_content +_pp_string = ( + _pp_str_flag + _pp_no_space + _pp_ignore + _pp_quantity + _pp_str_value +) # ## NUMERICAL VALUE ## # Initial marker for a numerical value -_pp_num_flag = pp.Word("".join(NumberMatchType), exact=1) +_pp_num_flag = pp.Literal(Content.Number.value).setResultsName(TokenField.Type) # Marker for the sign of the value; period indicates either sign -_pp_num_sign = pp.Word("".join(Sign), exact=1) +_pp_num_sign = pp.Word("".join(Sign), exact=1).setResultsName(TokenField.Sign) # Marker for the number type to look for -_pp_num_type = pp.Word("".join(Number), exact=1) +_pp_num_type = pp.Word("".join(Number), exact=1).setResultsName( + TokenField.Number +) # Composite pattern for a number _pp_number = ( - _pp_num_flag.setResultsName(TokenField.Type) - + pp.Group( - _pp_num_sign.setResultsName(TokenField.Sign) - + _pp_num_type.setResultsName(TokenField.Number) - 
).setResultsName(TokenField.SignNumber) - + pp.Optional(pp.Literal(_s_num_no_space)).setResultsName( - TokenField.NoSpace + _pp_num_flag + + _pp_no_space + + _pp_ignore + + _pp_quantity + + pp.Group(_pp_num_sign + _pp_num_type).setResultsName( + TokenField.SignNumber ) - + pp.WordEnd() ) + # ## COMBINED TOKEN PARSER ## -_pp_token = _pp_any_flag ^ _pp_string ^ _pp_number +_pp_token = ( + pp.StringStart() + + (_pp_any_flag ^ _pp_string ^ _pp_number) + + pp.StringEnd() +) # Will (presumably) eventually need to implement preceding/following # literal strings on the number specifications @@ -94,12 +116,18 @@ class Parser: @classmethod def convert_line(cls, line, *, capture_groups=True): - """Implement dirt-oversimple line converter.""" + """Convert line of tokens to regex. + + The constructed regex is required to match the entirety of a + line of text, using lookbehind and lookahead at the + start and end of the pattern, respectively. + + """ import shlex # Parse line into tokens, and then into Tokens tokens = shlex.split(line) - tokens = list(Token(_, capture=capture_groups) for _ in tokens) + tokens = list(Token(_, do_capture=capture_groups) for _ in tokens) # Zero-length start of line (or of entire string) match pattern = r"(^|(?<=\n))" @@ -119,7 +147,11 @@ def convert_line(cls, line, *, capture_groups=True): group_id += 1 tok_pattern = tok_pattern.format(str(group_id)) - if t.is_num: + if t.is_any: + pattern += tok_pattern + prior_no_space_token = False + + else: if not prior_no_space_token: tok_pattern = std_wordify_open(tok_pattern) @@ -131,10 +163,6 @@ def convert_line(cls, line, *, capture_groups=True): pattern += tok_pattern - else: - pattern += tok_pattern - prior_no_space_token = False - # Add required space or no space, depending on # what the token calls for, as long as it's not # the last token @@ -156,13 +184,13 @@ class Token: #: Mini-language token string to be parsed token = attr.ib() - #: Whether group captures should be added or not - capture = 
attr.ib(default=True) + #: Whether group capture should be added or not + do_capture = attr.ib(default=True) #: Flag for whether group ID substitution needs to be done needs_group_id = attr.ib(default=False, init=False, repr=False) - #: Compiled regex pattern from the token + #: Assembled regex pattern from the token, as |str| @property def pattern(self): return self._pattern @@ -170,33 +198,25 @@ def pattern(self): #: Flag for whether the token is an "any content" token @property def is_any(self): - return self._pr[0] in list(AnyMatchType) + return self._pr[TokenField.Type] == Content.Any #: Flag for whether the token matches a literal string @property def is_str(self): - return self._pr[0] in list(StringMatchType) + return self._pr[TokenField.Type] == Content.String #: Flag for whether the token matches a number @property def is_num(self): - return self._pr[0] in list(NumberMatchType) + return self._pr[TokenField.Type] == Content.Number - #: String matching type; |None| if token doesn't match a string + #: Match quantity; |None| for :attr:`pent.enums.Content.Any` @property - def str_match_type(self): - if self.is_str: - return StringMatchType(self._pr[0]) - else: + def match_quantity(self): + if self.is_any: return None - - #: Number matching type; |None| if token doesn't match a number - @property - def num_match_type(self): - if self.is_num: - return NumberMatchType(self._pr[0]) else: - return None + return Quantity(self._pr[TokenField.Quantity]) #: Number format matched; |None| if token doesn't match a number @property @@ -217,12 +237,15 @@ def sign(self): #: Flag for whether space should be provided for after the match @property def space_after(self): - if self.is_num: - return not TokenField.NoSpace in self._pr - elif self.is_str: - return True - else: + if self.is_any: return False + else: + return TokenField.NoSpace not in self._pr + + #: Flag for whether result should be ignored in returned output + @property + def ignore(self): + return TokenField.Ignore 
in self._pr def __attrs_post_init__(self): """Handle automatic creation stuff.""" @@ -232,21 +255,25 @@ def __attrs_post_init__(self): raise BadTokenError(self.token) from e if self.is_any: - self._pattern = ".*?" - - elif self.is_str: - self._pattern = self._string_pattern(self._pr[1]) - - if self.capture and self._pr[0] == StringMatchType.Capture: - self.needs_group_id = True - self._pattern = self._group_enclose(self._pattern) + self._pattern, self.needs_group_id = self._selective_group_enclose( + ".*?" + ) + return + # Only single, non-optional captures implemented for now, regardless of + # the Quantity flag in the token + if self.is_str: + self._pattern = self._string_pattern(self._pr[TokenField.Str]) elif self.is_num: self._pattern = self._get_number_pattern(self._pr) + else: + raise NotImplementedError( + "Unknown content type somehow specified!" + ) - if self.capture and self._pr[0] == NumberMatchType.Single: - self.needs_group_id = True - self._pattern = self._group_enclose(self._pattern) + self._pattern, self.needs_group_id = self._selective_group_enclose( + self._pattern + ) @staticmethod def _string_pattern(s): @@ -285,10 +312,16 @@ def _group_close(): """Create the closing pattern for a named group.""" return ")" - @classmethod - def _group_enclose(cls, pat): - """Enclose the pattern in the group enclosure.""" - return cls._group_open() + pat + cls._group_close() + def _selective_group_enclose(self, pat): + """Return token pattern enclosed in group IF it should be grouped. + + FIX THIS DOCSTRING, IT'S OUT OF DATE!!! 
+ + """ + if self.do_capture and not self.ignore: + return (self._group_open() + pat + self._group_close(), True) + else: + return pat, False if __name__ == "__main__": # pragma: no cover diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 529f7c7..fae4545 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -105,7 +105,7 @@ def test_parser_single_line_space_delim(self): from .testdata import number_sign_vals as vals test_line = "This line contains the value {} with space delimit." - test_pat_template = "~ !contains ~ #{0}{1} ~" + test_pat_template = "~! @.contains ~ #.{0}{1} ~!" for v in vals: test_str = test_line.format(v) @@ -125,9 +125,9 @@ def test_string_capture(self): import pent test_line = "This is a string with a word and [symbol] in it." - test_pat_capture = "~ =word ~" - test_pat_ignore = "~ !word ~" - test_pat_symbol = "~ =[symbol] ~" + test_pat_capture = "~! @.word ~!" + test_pat_ignore = "~! @!.word ~!" + test_pat_symbol = "~! @.[symbol] ~!" with self.subTest("capture"): pat = self.prs.convert_line(test_pat_capture) @@ -154,7 +154,7 @@ def test_single_num_capture(self): from .testdata import number_sign_vals as vals test_line = "This is a string with {} in it." - test_pat_template = "~ .{0}{1} ~" + test_pat_template = "~! #.{0}{1} ~!" for v in vals: test_str = test_line.format(v) @@ -186,7 +186,7 @@ def test_single_nums_no_space(self): from .testdata import number_sign_vals as vals test_str = "This is a string with 123-456 in it." - test_pat = "~ ..ix .-i ~" + test_pat = "~! #x.+i #.-i ~!" 
npat = self.prs.convert_line(test_pat) @@ -196,7 +196,7 @@ def test_single_nums_no_space(self): self.assertEqual(m.group(pent.group_prefix + "1"), "123") self.assertEqual(m.group(pent.group_prefix + "2"), "-456") - @ut.skip("Implementing no-space-preceding first") + # @ut.skip("Implementing no-space-preceding first") def test_single_num_preceding_colon_capture(self): """Confirm single-number capture works, with preceding colon.""" import pent @@ -204,7 +204,7 @@ def test_single_num_preceding_colon_capture(self): from .testdata import number_sign_vals as vals test_line = "This is a string with :{} in it, after a colon." - test_pat_template = "~ .{0}{1} ~" + test_pat_template = "~! @x!.: #.{0}{1} ~!" for v in vals: test_str = test_line.format(v) @@ -231,7 +231,7 @@ def test_string_and_single_num_capture(self): from .testdata import number_sign_vals as vals test_line = "This is a string with {} in it." - test_pat_template = "~ =string ~ .{0}{1} ~" + test_pat_template = "~! @.string ~! #.{0}{1} ~!" for v in vals: test_str = test_line.format(v) @@ -257,9 +257,11 @@ def test_string_and_single_num_capture(self): def test_match_entire_line(self): """Confirm the tilde works to match an entire line.""" test_line = "This is a line with whatever weird (*#$(*&23646{}}{#$" - test_pat = "~" - pat = self.prs.convert_line(test_pat) + pat = self.prs.convert_line("~") + self.assertTrue(self.does_parse_match(pat, test_line)) + + pat = self.prs.convert_line("~!") self.assertTrue(self.does_parse_match(pat, test_line)) From cfc6e0e9b44376a1b0c400b6ca6183ef64886977 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Thu, 13 Sep 2018 14:54:12 -0400 Subject: [PATCH 22/44] TEST: Refactor to fast/slow, add chain nospace Closes #11. Closes #12. Now can run quick tests with '-f'; all including slow with '-a'. No-space token chaining should be pretty robustly tested now. Docs admonition that needs to be made about chaining no-space number tokens is recorded as #23. 
--- pent/enums.py | 17 ------ pent/test/__init__.py | 4 +- pent/test/pent_base.py | 117 +++++++++++++++++++++++++++++++++++++++-- pent/test/testdata.py | 21 ++++++++ tests.py | 15 +++++- 5 files changed, 150 insertions(+), 24 deletions(-) diff --git a/pent/enums.py b/pent/enums.py index ea29b65..2cef865 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -72,23 +72,6 @@ class Content(str, Enum): Number = "#" -# class AnyMatchType(str, Enum): -# """Enumeration for various 'any' match types.""" -# -# #: Non-captured match -# Ignore = "~" - - -# class Capture(str, Enum): -# """Enumeration for whether to store the matched content.""" -# -# #: Captured match -# Capture = "=" -# -# #: Ignored match -# Ignore = "!" - - class Quantity(str, Enum): """Enumeration for the various match quantities.""" diff --git a/pent/test/__init__.py b/pent/test/__init__.py index 0909734..3090209 100644 --- a/pent/test/__init__.py +++ b/pent/test/__init__.py @@ -17,6 +17,6 @@ from __future__ import absolute_import -__all__ = ["suite_expect_good"] +__all__ = ["suite_base"] -from .pent_base import suite_expect_good +from .pent_base import suite_base diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index fae4545..229aa64 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -196,7 +196,6 @@ def test_single_nums_no_space(self): self.assertEqual(m.group(pent.group_prefix + "1"), "123") self.assertEqual(m.group(pent.group_prefix + "2"), "-456") - # @ut.skip("Implementing no-space-preceding first") def test_single_num_preceding_colon_capture(self): """Confirm single-number capture works, with preceding colon.""" import pent @@ -265,18 +264,130 @@ def test_match_entire_line(self): self.assertTrue(self.does_parse_match(pat, test_line)) -def suite_expect_good(): - """Create and return the test suite for expect-good tests.""" +class TestPentTokens(ut.TestCase, SuperPent): + """Direct tests on the Token class.""" + + def test_group_enclosures(self): + """Ensure 'ignore' flag is 
properly set.""" + import pent + + testname_fmt = "{0}_{1}" + token_fmt = { + pent.Content.Any: "~{0}", + pent.Content.String: "@{0}.thing", + pent.Content.Number: "#{0}..i", + } + + for c, i in itt.product(pent.Content, (True, False)): + t = pent.Token(token_fmt[c].format("!" if i else "")) + with self.subTest(testname_fmt.format(c, i)): + self.assertEqual(t.ignore, i) + + +class TestPentParserPatternsSlow(ut.TestCase, SuperPent): + """SLOW tests confirming pattern matching of Parser regexes.""" + + import pent + + prs = pent.Parser() + + def test_three_token_sequence(self): + "Ensure combinatorial token sequence parses correctly." "" + import pent + + from .testdata import number_patterns as nps + + pat_template = "~! {0} {1} {2} ~!" + str_template = "String! {0}{1}{2}{3}{4} More String!" + str_pat = {"foo": "@{0}{1}{2}foo"} + + testname_template = "{0}_{1}_{2}_{3}_{4}" + + str_or_num = (pent.Content.String, pent.Content.Number) + t_f = (True, False) + + for c1, s1, c2, s2, c3 in itt.product( + str_or_num, t_f, str_or_num, t_f, str_or_num + ): + if (c1 is c2 and not s1) or (c2 is c3 and not s2): + # No reason to have no-space strings against one another. + # No-space numbers adjacent to one another make no syntactic sense. 
+ continue + + vals1 = str_pat if c1 == pent.Content.String else nps.keys() + vals2 = str_pat if c2 == pent.Content.String else nps.keys() + vals3 = str_pat if c3 == pent.Content.String else nps.keys() + + for v1, v2, v3 in itt.product(vals1, vals2, vals3): + p1 = (str_pat if c1 == pent.Content.String else nps)[ + v1 + ].format( + pent.parser._s_no_space if not s1 else "", + "", + pent.Quantity.Single, + ) + p2 = (str_pat if c2 == pent.Content.String else nps)[ + v2 + ].format( + pent.parser._s_no_space if not s2 else "", + "", + pent.Quantity.Single, + ) + p3 = (str_pat if c3 == pent.Content.String else nps)[ + v3 + ].format("", "", pent.Quantity.Single) + + test_pat = pat_template.format(p1, p2, p3) + test_str = str_template.format( + v1, " " if s1 else "", v2, " " if s2 else "", v3 + ) + + with self.subTest( + testname_template.format(v1, s1, v2, s2, v3) + ): + npat = self.prs.convert_line(test_pat) + + m = re.search(npat, test_str) + + self.assertIsNotNone(m, msg=test_pat) + self.assertEqual( + m.group(pent.group_prefix + "1"), + v1, + msg=test_pat + " :: " + test_str, + ) + self.assertEqual( + m.group(pent.group_prefix + "2"), + v2, + msg=test_pat + " :: " + test_str, + ) + self.assertEqual( + m.group(pent.group_prefix + "3"), + v3, + msg=test_pat + " :: " + test_str, + ) + + +def suite_base(): + """Create and return the test suite for base tests.""" s = ut.TestSuite() tl = ut.TestLoader() s.addTests( [ tl.loadTestsFromTestCase(TestPentCorePatterns), tl.loadTestsFromTestCase(TestPentParserPatterns), + tl.loadTestsFromTestCase(TestPentTokens), ] ) return s +def suite_base_slow(): + """Create and return the test suite for SLOW base tests.""" + s = ut.TestSuite() + tl = ut.TestLoader() + s.addTests([tl.loadTestsFromTestCase(TestPentParserPatternsSlow)]) + return s + + if __name__ == "__main__": print("Module not executable.") diff --git a/pent/test/testdata.py b/pent/test/testdata.py index 710be41..a3fa8d8 100644 --- a/pent/test/testdata.py +++ 
b/pent/test/testdata.py @@ -29,6 +29,27 @@ from pent import Number, Sign +number_token_template = "#{{0}}{{1}}{{2}}{0}" + +number_patterns = { + "123": number_token_template.format(".i"), + "-123": number_token_template.format("-i"), + "+123": number_token_template.format("+i"), + "0.2": number_token_template.format(".f"), + "-.285": number_token_template.format("-f"), + "+315.": number_token_template.format("+f"), + "3e5": number_token_template.format(".s"), + "-.13e+5": number_token_template.format("-s"), + "+3.1e-5": number_token_template.format("+s"), + ".266": number_token_template.format(".d"), + "-15.285": number_token_template.format("-d"), + "+315.185": number_token_template.format("+d"), + "35": number_token_template.format(".g"), + "-.13": number_token_template.format("-g"), + "+3.1e+15": number_token_template.format("+g"), +} + +assert len(number_patterns) == 15 number_sign_vals = { "0": { diff --git a/tests.py b/tests.py index cb43812..ff0828b 100644 --- a/tests.py +++ b/tests.py @@ -36,6 +36,8 @@ class AP(object): ALL = "all" + FAST = "fast" + PFX = "--{0}" @@ -57,6 +59,12 @@ def get_parser(): action="store_true", help="Run all tests (overrides any other selections)", ) + prs.add_argument( + AP.PFX.format(AP.FAST), + "-f", + action="store_true", + help="Run only 'fast' tests", + ) # Return the parser return prs @@ -87,8 +95,11 @@ def addsuiteif(suite, flags): ts.addTest(suite) # Add commandline-indicated tests per-group - # Expect-good tests - addsuiteif(pent.test.pent_base.suite_expect_good(), [AP.ALL]) + # Fast tests + addsuiteif(pent.test.pent_base.suite_base(), [AP.ALL, AP.FAST]) + + # Slow tests + addsuiteif(pent.test.pent_base.suite_base_slow(), [AP.ALL]) # Create the test runner and execute ttr = ut.TextTestRunner(buffer=True, verbosity=(2 if params["v"] else 1)) From 7df527028909abc3d86572660ad1b2e65949acf8 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Thu, 13 Sep 2018 23:25:06 -0400 Subject: [PATCH 23/44] FLAKE8: Bring to compliance and 
enable --- .travis.yml | 2 +- pent/enums.py | 3 ++- pent/errors.py | 4 ++++ pent/parser.py | 18 +++++++++--------- pent/test/pent_base.py | 9 ++++----- requirements-travis.txt | 3 +++ 6 files changed, 23 insertions(+), 16 deletions(-) diff --git a/.travis.yml b/.travis.yml index 63ce14c..179196f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ python: - 3.7-dev script: - coverage run tests.py -a -# - flake8 pent + - flake8 pent # - echo $TRAVIS_PYTHON_VERSION | grep -e '^3\.6' && sh -c 'cd doc; make doctest' || echo 'No doctest.' - echo $TRAVIS_PYTHON_VERSION | grep -e '^3\.6' && codecov || echo "No codecov." diff --git a/pent/enums.py b/pent/enums.py index 2cef865..13a77c3 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -100,7 +100,8 @@ class TokenField(str, Enum): #: Flag to ignore matched content when collecting into regex groups Ignore = "ignore" - #: Match quantity of the field (single value, one-or-more, zero-or-more, etc.) + #: Match quantity of the field (single value, optional, + #: one-or-more, zero-or-more, etc.) 
Quantity = "quantity" #: Literal content, for a string match diff --git a/pent/errors.py b/pent/errors.py index c0c6367..3d5ebba 100644 --- a/pent/errors.py +++ b/pent/errors.py @@ -26,6 +26,8 @@ class PentError(Exception): # pragma: no cover + """Superclass for all custom |pent| errors.""" + pass @@ -33,9 +35,11 @@ class BadTokenError(PentError): # pragma: no cover """Raised during attempts to parse an invalid token.""" def __init__(self, token): + """Instantiate a ``BadTokenError``.""" self.token = token def __str__(self): + """Generate a more-informative error message.""" return "'{}' is an invalid pent token".format(self.token) diff --git a/pent/parser.py b/pent/parser.py index bb00282..8b17076 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -190,61 +190,61 @@ class Token: #: Flag for whether group ID substitution needs to be done needs_group_id = attr.ib(default=False, init=False, repr=False) - #: Assembled regex pattern from the token, as |str| @property def pattern(self): + """Return assembled regex pattern from the token, as |str|.""" return self._pattern - #: Flag for whether the token is an "any content" token @property def is_any(self): + """Return flag for whether the token is an "any content" token.""" return self._pr[TokenField.Type] == Content.Any - #: Flag for whether the token matches a literal string @property def is_str(self): + """Return flag for whether the token matches a literal string.""" return self._pr[TokenField.Type] == Content.String - #: Flag for whether the token matches a number @property def is_num(self): + """Return flag for whether the token matches a number.""" return self._pr[TokenField.Type] == Content.Number - #: Match quantity; |None| for :attr:`pent.enums.Content.Any` @property def match_quantity(self): + """Return match quantity; |None| for :attr:`pent.enums.Content.Any`.""" if self.is_any: return None else: return Quantity(self._pr[TokenField.Quantity]) - #: Number format matched; |None| if token doesn't match a number 
@property def number(self): + """#: Return number format; |None| if token doesn't match a number.""" if self.is_num: return Number(self._pr[TokenField.SignNumber][TokenField.Number]) else: return None - #: Number sign matched; |None| if token doesn't match a number @property def sign(self): + """#: Return number sign; |None| if token doesn't match a number.""" if self.is_num: return Sign(self._pr[TokenField.SignNumber][TokenField.Sign]) else: return None - #: Flag for whether space should be provided for after the match @property def space_after(self): + """Return flag for whether post-match space should be provided for.""" if self.is_any: return False else: return TokenField.NoSpace not in self._pr - #: Flag for whether result should be ignored in returned output @property def ignore(self): + """Return flag for whether a regex match group should be created.""" return TokenField.Ignore in self._pr def __attrs_post_init__(self): diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 229aa64..8ed0d65 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -183,8 +183,6 @@ def test_single_nums_no_space(self): """ import pent - from .testdata import number_sign_vals as vals - test_str = "This is a string with 123-456 in it." test_pat = "~! #x.+i #.-i ~!" @@ -292,7 +290,7 @@ class TestPentParserPatternsSlow(ut.TestCase, SuperPent): prs = pent.Parser() def test_three_token_sequence(self): - "Ensure combinatorial token sequence parses correctly." "" + """Ensure combinatorial token sequence parses correctly.""" import pent from .testdata import number_patterns as nps @@ -310,8 +308,9 @@ def test_three_token_sequence(self): str_or_num, t_f, str_or_num, t_f, str_or_num ): if (c1 is c2 and not s1) or (c2 is c3 and not s2): - # No reason to have no-space strings against one another. - # No-space numbers adjacent to one another make no syntactic sense. 
+ # No reason to have no-space strings against one another; + # no-space numbers adjacent to one another make + # no syntactic sense. continue vals1 = str_pat if c1 == pent.Content.String else nps.keys() diff --git a/requirements-travis.txt b/requirements-travis.txt index 275ad96..ecab9ab 100644 --- a/requirements-travis.txt +++ b/requirements-travis.txt @@ -2,3 +2,6 @@ attrs<18 pyparsing coverage codecov +flake8 +flake8-docstrings + From feb6b196681b84cb0360ba570294d4710edd94ee Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Fri, 14 Sep 2018 13:06:07 -0400 Subject: [PATCH 24/44] TEST: Check presence/absence of group tags Closes #22. --- pent/test/pent_base.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 8ed0d65..4274100 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -94,6 +94,23 @@ class TestPentParserPatterns(ut.TestCase, SuperPent): prs = pent.Parser() + def test_group_tags_or_not(self): + """Confirm group tags are added when needed; omitted when not.""" + import pent + + patterns = { + pent.Content.Any: "~{}", + pent.Content.String: "@{}.this", + pent.Content.Number: "#{}..g", + } + + for content, capture in itt.product(pent.Content, (True, False)): + test_name = "{0}_{1}".format(content, capture) + with self.subTest(test_name): + test_pat = patterns[content].format("" if capture else "!") + test_rx = self.prs.convert_line(test_pat) + self.assertEqual(capture, "(?P<" in test_rx, msg=test_pat) + def test_parser_single_line_space_delim(self): """Confirm parser works on single lines with space-delimited values. From 6ead299722ce6d010a61cefc56cd72db41002b99 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Fri, 14 Sep 2018 13:31:22 -0400 Subject: [PATCH 25/44] TEST: Expand 'any' checks Expand entire-line 'any' matching to check both capturing and non-capturing cases. Add new test to ensure 'any' matches on substrings of a line work as expected. Closes #19. 
--- pent/test/pent_base.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 4274100..32429f7 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -270,13 +270,39 @@ def test_string_and_single_num_capture(self): def test_match_entire_line(self): """Confirm the tilde works to match an entire line.""" + import pent + test_line = "This is a line with whatever weird (*#$(*&23646{}}{#$" - pat = self.prs.convert_line("~") - self.assertTrue(self.does_parse_match(pat, test_line)) + with self.subTest("capture"): + pat = self.prs.convert_line("~") + self.assertTrue(self.does_parse_match(pat, test_line)) + + m = re.search(pat, test_line) + self.assertEqual(test_line, m.group(pent.group_prefix + "1")) + + with self.subTest("no_capture"): + pat = self.prs.convert_line("~!") + self.assertTrue(self.does_parse_match(pat, test_line)) + + m = re.search(pat, test_line) + self.assertRaises(IndexError, m.group, pent.group_prefix + "1") + + def test_any_token_capture_ranges(self): + """Confirm 'any' captures work as expected with other tokens.""" + import pent + + test_line_start = "This is a line " + test_line_end = " with a number in brackets in the middle." + test_num = "2e-4" + test_line = test_line_start + "[" + test_num + "]" + test_line_end + + pat = pent.Parser().convert_line("~ @x!.[ #x..g @x!.] 
~") + m = re.search(pat, test_line) - pat = self.prs.convert_line("~!") - self.assertTrue(self.does_parse_match(pat, test_line)) + self.assertEqual(m.group(pent.group_prefix + "1"), test_line_start) + self.assertEqual(m.group(pent.group_prefix + "2"), test_num) + self.assertEqual(m.group(pent.group_prefix + "3"), test_line_end) class TestPentTokens(ut.TestCase, SuperPent): From 60a6b134fdc26890822e9462d0357d1f948a73d3 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Fri, 14 Sep 2018 13:45:49 -0400 Subject: [PATCH 26/44] TEST: Add likely common end-of-sentence test Have to explicitly put in the period as a literal string, and use the no-space-after on the number token, but it works fine. Closes #1. --- pent/test/pent_base.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 32429f7..bc24d4d 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -268,6 +268,24 @@ def test_string_and_single_num_capture(self): ) self.assertEqual(m.group(pent.group_prefix + "2"), v) + def number_ending_sentence(self): + """Check that a number at the end of a sentence is matched correctly.""" + import pent + + from .testdata import number_patterns as npats + + test_line = "This sentence ends with a number {}." + test_pat = "~! {} @!.." + + for n in npats: + token = npats[n].format("", "", ".") + with self.subTest(token): + pat = test_pat.format(token) + m = re.search(pat, test_line.format(n)) + + self.assertIsNotNone(m, msg=token) + self.assertEqual(n, m.group(pent.group_prefix + "1")) + def test_match_entire_line(self): """Confirm the tilde works to match an entire line.""" import pent From 0b846a04cdae237c628bc32dd247f5094938b18b Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Fri, 14 Sep 2018 13:55:26 -0400 Subject: [PATCH 27/44] TEST: Check manual concat of two rx patterns Works! Just need to tag an explicit newline between them. Closes #21.
Also add Win helper for running flake8, to show the offending lines of source. --- flake.bat | 3 +++ pent/test/pent_base.py | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 flake.bat diff --git a/flake.bat b/flake.bat new file mode 100644 index 0000000..009f7a3 --- /dev/null +++ b/flake.bat @@ -0,0 +1,3 @@ +@echo off + +flake8 --show-source pent diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index bc24d4d..68b5779 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -269,7 +269,7 @@ def test_string_and_single_num_capture(self): self.assertEqual(m.group(pent.group_prefix + "2"), v) def number_ending_sentence(self): - """Check that a number at the end of a sentence is matched correctly.""" + """Check that a number at the end of a sentence matches correctly.""" import pent from .testdata import number_patterns as npats @@ -322,6 +322,20 @@ def test_any_token_capture_ranges(self): self.assertEqual(m.group(pent.group_prefix + "2"), test_num) self.assertEqual(m.group(pent.group_prefix + "3"), test_line_end) + def test_manual_two_lines(self): + """Run manual check on concatenating two single-line regexes.""" + test_str = "This is line one: 12345 \nAnd this is line two: -3e-5" + + test_pat_1 = "~! @!.one: #!.+i" + test_pat_2 = "~! @!.two: #!.-s" + + cp_1 = self.prs.convert_line(test_pat_1) + cp_2 = self.prs.convert_line(test_pat_2) + + m = re.search(cp_1 + r"\n" + cp_2, test_str) + + self.assertIsNotNone(m) + class TestPentTokens(ut.TestCase, SuperPent): """Direct tests on the Token class.""" From 1ea8cda043e34b39be7818a0f9344afc8a7e9314 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sun, 16 Sep 2018 21:07:02 -0400 Subject: [PATCH 28/44] TEST: Add simple token property tests & 'no cover' Token.number and Token.sign are now (simply) tested for the patterns given in testdata.number_patterns. The NotImplementedError 'else' in Token.__attrs_post_init__ is now excluded from coverage. 
It SHOULD never be reached... --- pent/parser.py | 2 +- pent/test/pent_base.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/pent/parser.py b/pent/parser.py index 8b17076..0784cff 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -266,7 +266,7 @@ def __attrs_post_init__(self): self._pattern = self._string_pattern(self._pr[TokenField.Str]) elif self.is_num: self._pattern = self._get_number_pattern(self._pr) - else: + else: # pragma: no cover raise NotImplementedError( "Unknown content type somehow specified!" ) diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 68b5779..131e847 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -340,6 +340,12 @@ def test_manual_two_lines(self): class TestPentTokens(ut.TestCase, SuperPent): """Direct tests on the Token class.""" + def test_arbitrary_bad_token(self): + """Confirm bad tokens raise errors.""" + import pent + + self.assertRaises(pent.BadTokenError, pent.Token, "abcd") + def test_group_enclosures(self): """Ensure 'ignore' flag is properly set.""" import pent @@ -356,6 +362,40 @@ def test_group_enclosures(self): with self.subTest(testname_fmt.format(c, i)): self.assertEqual(t.ignore, i) + def test_number_property(self): + """Ensure t.number properties return correct values.""" + import pent + + from .testdata import number_patterns as npats + + for p in npats.values(): + pat = p.format("", "", pent.Quantity.Single) + with self.subTest(pat): + self.assertEqual(pent.Token(pat).number, pent.Number(p[-1])) + + with self.subTest("string"): + self.assertEqual(pent.Token("@.abcd").number, None) + + with self.subTest("any"): + self.assertEqual(pent.Token("~").number, None) + + def test_sign_property(self): + """Ensure t.sign properties return correct values.""" + import pent + + from .testdata import number_patterns as npats + + for p in npats.values(): + pat = p.format("", "", pent.Quantity.Single) + with self.subTest(pat): + 
self.assertEqual(pent.Token(pat).sign, pent.Sign(p[-2])) + + with self.subTest("string"): + self.assertEqual(pent.Token("@.abcd").sign, None) + + with self.subTest("any"): + self.assertEqual(pent.Token("~").sign, None) + class TestPentParserPatternsSlow(ut.TestCase, SuperPent): """SLOW tests confirming pattern matching of Parser regexes.""" From 3d938e182772d94891e5dd051015478c611bdd9f Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sun, 16 Sep 2018 21:10:07 -0400 Subject: [PATCH 29/44] DEV: Convert Parser and Token to slots=True May end up converting Parser back, but I don't think so. Closes #28. --- pent/parser.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pent/parser.py b/pent/parser.py index 0784cff..6a9111f 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -110,7 +110,7 @@ # ## PARSER CLASS FOR EXTERNAL USE ## -@attr.s +@attr.s(slots=True) class Parser: """Mini-language parser for structured numerical data.""" @@ -175,7 +175,7 @@ def convert_line(cls, line, *, capture_groups=True): return pattern -@attr.s +@attr.s(slots=True) class Token: """Encapsulates transforming mini-language patterns tokens into regex.""" @@ -190,6 +190,10 @@ class Token: #: Flag for whether group ID substitution needs to be done needs_group_id = attr.ib(default=False, init=False, repr=False) + # Internal pyparsing result and generated regex pattern + _pr = attr.ib(default=None, init=False, repr=False) + _pattern = attr.ib(default=None, init=False, repr=False) + @property def pattern(self): """Return assembled regex pattern from the token, as |str|.""" From 149951e5cebd452dfaed5dc8df3075dedc8c7f57 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sun, 16 Sep 2018 22:09:09 -0400 Subject: [PATCH 30/44] REFACTOR: Convert 'non-capture' to default behavior Most patterns will probably have more ignored tokens than captured tokens. Should make for cleaner input with 'non-capture' being the default. Closes #30. 
--- pent/enums.py | 2 +- pent/parser.py | 26 ++++++------ pent/test/pent_base.py | 96 ++++++++++++++++++++++-------------------- 3 files changed, 63 insertions(+), 61 deletions(-) diff --git a/pent/enums.py b/pent/enums.py index 13a77c3..5143d23 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -98,7 +98,7 @@ class TokenField(str, Enum): NoSpace = "no_space" #: Flag to ignore matched content when collecting into regex groups - Ignore = "ignore" + Capture = "capture" #: Match quantity of the field (single value, optional, #: one-or-more, zero-or-more, etc.) diff --git a/pent/parser.py b/pent/parser.py index 6a9111f..565210b 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -38,14 +38,14 @@ # ## HELPERS ## group_prefix = "g" _s_any_flag = "~" -_s_ignore = "!" +_s_capture = "!" _s_no_space = "x" _pp_no_space = pp.Optional(pp.Literal(_s_no_space)).setResultsName( TokenField.NoSpace ) -_pp_ignore = pp.Optional(pp.Literal(_s_ignore)).setResultsName( - TokenField.Ignore +_pp_capture = pp.Optional(pp.Literal(_s_capture)).setResultsName( + TokenField.Capture ) _pp_quantity = pp.Word("".join(Quantity), exact=1).setResultsName( TokenField.Quantity @@ -57,7 +57,7 @@ # Definitely want to give the option not to capture. Might ideally # be the default NOT to capture here... 
_pp_any_flag = ( - pp.Literal(_s_any_flag).setResultsName(TokenField.Type) + _pp_ignore + pp.Literal(_s_any_flag).setResultsName(TokenField.Type) + _pp_capture ) # ## LITERAL STRING ## @@ -69,7 +69,7 @@ # Composite pattern for a literal string _pp_string = ( - _pp_str_flag + _pp_no_space + _pp_ignore + _pp_quantity + _pp_str_value + _pp_str_flag + _pp_no_space + _pp_capture + _pp_quantity + _pp_str_value ) # ## NUMERICAL VALUE ## @@ -88,7 +88,7 @@ _pp_number = ( _pp_num_flag + _pp_no_space - + _pp_ignore + + _pp_capture + _pp_quantity + pp.Group(_pp_num_sign + _pp_num_type).setResultsName( TokenField.SignNumber @@ -103,9 +103,6 @@ + pp.StringEnd() ) -# Will (presumably) eventually need to implement preceding/following -# literal strings on the number specifications - # ## PARSER CLASS FOR EXTERNAL USE ## @@ -143,9 +140,10 @@ def convert_line(cls, line, *, capture_groups=True): for i, t in enumerate(tokens): tok_pattern = t.pattern + if t.needs_group_id: - group_id += 1 tok_pattern = tok_pattern.format(str(group_id)) + group_id += 1 if t.is_any: pattern += tok_pattern @@ -247,9 +245,9 @@ def space_after(self): return TokenField.NoSpace not in self._pr @property - def ignore(self): - """Return flag for whether a regex match group should be created.""" - return TokenField.Ignore in self._pr + def capture(self): + """Return flag for whether a regex capture group should be created.""" + return TokenField.Capture in self._pr def __attrs_post_init__(self): """Handle automatic creation stuff.""" @@ -322,7 +320,7 @@ def _selective_group_enclose(self, pat): FIX THIS DOCSTRING, IT'S OUT OF DATE!!! 
""" - if self.do_capture and not self.ignore: + if self.do_capture and self.capture: return (self._group_open() + pat + self._group_close(), True) else: return pat, False diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 131e847..169ae96 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -37,7 +37,7 @@ class SuperPent: @staticmethod def does_parse_match(re_pat, s): - """Run an individual parse test on `s` using regex pattern `re_pat`.""" + """Run match-or-not test on `s` using regex pattern `re_pat`.""" m = re.search(re_pat, s) return m is not None @@ -107,7 +107,7 @@ def test_group_tags_or_not(self): for content, capture in itt.product(pent.Content, (True, False)): test_name = "{0}_{1}".format(content, capture) with self.subTest(test_name): - test_pat = patterns[content].format("" if capture else "!") + test_pat = patterns[content].format("!" if capture else "") test_rx = self.prs.convert_line(test_pat) self.assertEqual(capture, "(?P<" in test_rx, msg=test_pat) @@ -122,7 +122,7 @@ def test_parser_single_line_space_delim(self): from .testdata import number_sign_vals as vals test_line = "This line contains the value {} with space delimit." - test_pat_template = "~! @.contains ~ #.{0}{1} ~!" + test_pat_template = "~ @!.contains ~! #!.{0}{1} ~" for v in vals: test_str = test_line.format(v) @@ -142,27 +142,27 @@ def test_string_capture(self): import pent test_line = "This is a string with a word and [symbol] in it." - test_pat_capture = "~! @.word ~!" - test_pat_ignore = "~! @!.word ~!" - test_pat_symbol = "~! @.[symbol] ~!" 
+ test_pat_capture = "~ @!.word ~" + test_pat_ignore = "~ @.word ~" + test_pat_symbol = "~ @!.[symbol] ~" with self.subTest("capture"): pat = self.prs.convert_line(test_pat_capture) m = re.search(pat, test_line) self.assertIsNotNone(m) - self.assertEqual(m.group(pent.group_prefix + "1"), "word") + self.assertEqual(m.group(pent.group_prefix + "0"), "word") with self.subTest("ignore"): pat = self.prs.convert_line(test_pat_ignore) m = re.search(pat, test_line) self.assertIsNotNone(m) - self.assertRaises(IndexError, m.group, pent.group_prefix + "1") + self.assertRaises(IndexError, m.group, pent.group_prefix + "0") with self.subTest("symbol"): pat = self.prs.convert_line(test_pat_symbol) m = re.search(pat, test_line) self.assertIsNotNone(m) - self.assertEqual(m.group(pent.group_prefix + "1"), "[symbol]") + self.assertEqual(m.group(pent.group_prefix + "0"), "[symbol]") def test_single_num_capture(self): """Confirm single-number capture works.""" @@ -171,7 +171,7 @@ def test_single_num_capture(self): from .testdata import number_sign_vals as vals test_line = "This is a string with {} in it." - test_pat_template = "~! #.{0}{1} ~!" + test_pat_template = "~ #!.{0}{1} ~" for v in vals: test_str = test_line.format(v) @@ -189,7 +189,7 @@ def test_single_num_capture(self): ) if m is not None: - self.assertEqual(m.group(pent.group_prefix + "1"), v) + self.assertEqual(m.group(pent.group_prefix + "0"), v) def test_single_nums_no_space(self): """Confirm two-number capture works, with no intervening space. @@ -201,15 +201,15 @@ def test_single_nums_no_space(self): import pent test_str = "This is a string with 123-456 in it." - test_pat = "~! #x.+i #.-i ~!" 
+ test_pat = "~ #x!.+i #!.-i ~" npat = self.prs.convert_line(test_pat) m = re.search(npat, test_str) self.assertIsNotNone(m) - self.assertEqual(m.group(pent.group_prefix + "1"), "123") - self.assertEqual(m.group(pent.group_prefix + "2"), "-456") + self.assertEqual(m.group(pent.group_prefix + "0"), "123") + self.assertEqual(m.group(pent.group_prefix + "1"), "-456") def test_single_num_preceding_colon_capture(self): """Confirm single-number capture works, with preceding colon.""" @@ -218,7 +218,7 @@ def test_single_num_preceding_colon_capture(self): from .testdata import number_sign_vals as vals test_line = "This is a string with :{} in it, after a colon." - test_pat_template = "~! @x!.: #.{0}{1} ~!" + test_pat_template = "~ @x.: #!.{0}{1} ~" for v in vals: test_str = test_line.format(v) @@ -236,7 +236,7 @@ def test_single_num_preceding_colon_capture(self): ) if m is not None: - self.assertEqual(m.group(pent.group_prefix + "1"), v) + self.assertEqual(m.group(pent.group_prefix + "0"), v) def test_string_and_single_num_capture(self): """Confirm multiple capture of string and single number.""" @@ -245,7 +245,7 @@ def test_string_and_single_num_capture(self): from .testdata import number_sign_vals as vals test_line = "This is a string with {} in it." - test_pat_template = "~! @.string ~! #.{0}{1} ~!" + test_pat_template = "~ @!.string ~ #!.{0}{1} ~" for v in vals: test_str = test_line.format(v) @@ -264,27 +264,31 @@ def test_string_and_single_num_capture(self): if m is not None: self.assertEqual( - m.group(pent.group_prefix + "1"), "string" + m.group(pent.group_prefix + "0"), "string" ) - self.assertEqual(m.group(pent.group_prefix + "2"), v) + self.assertEqual(m.group(pent.group_prefix + "1"), v) - def number_ending_sentence(self): + def test_number_ending_sentence(self): """Check that a number at the end of a sentence matches correctly.""" import pent from .testdata import number_patterns as npats test_line = "This sentence ends with a number {}." - test_pat = "~! 
{} @!.." + test_pat = "~ {} @.." for n in npats: - token = npats[n].format("", "", ".") + token = npats[n].format( + pent.parser._s_no_space, + pent.parser._s_capture, + pent.Quantity.Single, + ) with self.subTest(token): - pat = test_pat.format(token) + pat = self.prs.convert_line(test_pat.format(token)) m = re.search(pat, test_line.format(n)) - self.assertIsNotNone(m, msg=token) - self.assertEqual(n, m.group(pent.group_prefix + "1")) + self.assertIsNotNone(m, msg=test_line.format(n) + token) + self.assertEqual(n, m.group(pent.group_prefix + "0")) def test_match_entire_line(self): """Confirm the tilde works to match an entire line.""" @@ -293,18 +297,18 @@ def test_match_entire_line(self): test_line = "This is a line with whatever weird (*#$(*&23646{}}{#$" with self.subTest("capture"): - pat = self.prs.convert_line("~") + pat = self.prs.convert_line("~!") self.assertTrue(self.does_parse_match(pat, test_line)) m = re.search(pat, test_line) - self.assertEqual(test_line, m.group(pent.group_prefix + "1")) + self.assertEqual(test_line, m.group(pent.group_prefix + "0")) with self.subTest("no_capture"): - pat = self.prs.convert_line("~!") + pat = self.prs.convert_line("~") self.assertTrue(self.does_parse_match(pat, test_line)) m = re.search(pat, test_line) - self.assertRaises(IndexError, m.group, pent.group_prefix + "1") + self.assertRaises(IndexError, m.group, pent.group_prefix + "0") def test_any_token_capture_ranges(self): """Confirm 'any' captures work as expected with other tokens.""" @@ -315,19 +319,19 @@ def test_any_token_capture_ranges(self): test_num = "2e-4" test_line = test_line_start + "[" + test_num + "]" + test_line_end - pat = pent.Parser().convert_line("~ @x!.[ #x..g @x!.] ~") + pat = pent.Parser().convert_line("~! @x.[ #x!..g @x.] 
~!") m = re.search(pat, test_line) - self.assertEqual(m.group(pent.group_prefix + "1"), test_line_start) - self.assertEqual(m.group(pent.group_prefix + "2"), test_num) - self.assertEqual(m.group(pent.group_prefix + "3"), test_line_end) + self.assertEqual(m.group(pent.group_prefix + "0"), test_line_start) + self.assertEqual(m.group(pent.group_prefix + "1"), test_num) + self.assertEqual(m.group(pent.group_prefix + "2"), test_line_end) def test_manual_two_lines(self): """Run manual check on concatenating two single-line regexes.""" test_str = "This is line one: 12345 \nAnd this is line two: -3e-5" - test_pat_1 = "~! @!.one: #!.+i" - test_pat_2 = "~! @!.two: #!.-s" + test_pat_1 = "~ @.one: #.+i" + test_pat_2 = "~ @.two: #.-s" cp_1 = self.prs.convert_line(test_pat_1) cp_2 = self.prs.convert_line(test_pat_2) @@ -357,10 +361,10 @@ def test_group_enclosures(self): pent.Content.Number: "#{0}..i", } - for c, i in itt.product(pent.Content, (True, False)): - t = pent.Token(token_fmt[c].format("!" if i else "")) - with self.subTest(testname_fmt.format(c, i)): - self.assertEqual(t.ignore, i) + for ct, cap in itt.product(pent.Content, (True, False)): + t = pent.Token(token_fmt[ct].format("!" if cap else "")) + with self.subTest(testname_fmt.format(ct, cap)): + self.assertEqual(t.capture, cap) def test_number_property(self): """Ensure t.number properties return correct values.""" @@ -410,7 +414,7 @@ def test_three_token_sequence(self): from .testdata import number_patterns as nps - pat_template = "~! {0} {1} {2} ~!" + pat_template = "~ {0} {1} {2} ~" str_template = "String! {0}{1}{2}{3}{4} More String!" 
str_pat = {"foo": "@{0}{1}{2}foo"} @@ -437,19 +441,19 @@ def test_three_token_sequence(self): v1 ].format( pent.parser._s_no_space if not s1 else "", - "", + pent.parser._s_capture, pent.Quantity.Single, ) p2 = (str_pat if c2 == pent.Content.String else nps)[ v2 ].format( pent.parser._s_no_space if not s2 else "", - "", + pent.parser._s_capture, pent.Quantity.Single, ) p3 = (str_pat if c3 == pent.Content.String else nps)[ v3 - ].format("", "", pent.Quantity.Single) + ].format("", pent.parser._s_capture, pent.Quantity.Single) test_pat = pat_template.format(p1, p2, p3) test_str = str_template.format( @@ -465,17 +469,17 @@ def test_three_token_sequence(self): self.assertIsNotNone(m, msg=test_pat) self.assertEqual( - m.group(pent.group_prefix + "1"), + m.group(pent.group_prefix + "0"), v1, msg=test_pat + " :: " + test_str, ) self.assertEqual( - m.group(pent.group_prefix + "2"), + m.group(pent.group_prefix + "1"), v2, msg=test_pat + " :: " + test_str, ) self.assertEqual( - m.group(pent.group_prefix + "3"), + m.group(pent.group_prefix + "2"), v3, msg=test_pat + " :: " + test_str, ) From 4dd2e1f97fbb7d5767ff1e5df69a7109a6c56d9a Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 17 Sep 2018 00:02:17 -0400 Subject: [PATCH 31/44] DEV: Disable Optional/ZeroOrMore modes Coping with the start-/end-of-word markers for the tokens where content may or may not be present is proving problematic. If content isn't actually there, then no wordification or following space should(?) be added to the pattern; but, if something IS there, then both should(?) be added, provided the relevant tokens are space-after tokens. Declaring explicit literals prior to these (to Optional, at least) doesn't really help anything, as the duplicated "[ \t]+" means that a single space won't be matched. May need to implement without these modes. SHOULD(?) be possible to still compose patterns for just about any data layout even without them? Might be more convoluted, but still possible... hopefully. 
--- pent/enums.py | 8 ++-- pent/parser.py | 10 +++++ pent/test/pent_base.py | 97 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 4 deletions(-) diff --git a/pent/enums.py b/pent/enums.py index 5143d23..d2569e2 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -78,14 +78,14 @@ class Quantity(str, Enum): #: Single value match Single = "." - #: Optional single value match - Optional = "?" + # ~ #: Optional single value match + # ~ Optional = "?" #: One-or-more match OneOrMore = "+" - #: Zero-or-more match - ZeroOrMore = "*" + # ~ #: Zero-or-more match + # ~ ZeroOrMore = "*" class TokenField(str, Enum): diff --git a/pent/parser.py b/pent/parser.py index 565210b..121cc52 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -265,7 +265,17 @@ def __attrs_post_init__(self): # Only single, non-optional captures implemented for now, regardless of # the Quantity flag in the token if self.is_str: + # Always store the string pattern self._pattern = self._string_pattern(self._pr[TokenField.Str]) + + # Modify, depending on the Quantity + # ~ if self.match_quantity is Quantity.Optional: + # ~ self._pattern = "(" + self._pattern + ")?" 
+ if self.match_quantity is Quantity.OneOrMore: + self._pattern = "(" + self._pattern + ")+" + # ~ if self.match_quantity is Quantity.ZeroOrMore: + # ~ self._pattern = "(" + self._pattern + ")*" + elif self.is_num: self._pattern = self._get_number_pattern(self._pr) else: # pragma: no cover diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 169ae96..b4d8984 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -326,6 +326,103 @@ def test_any_token_capture_ranges(self): self.assertEqual(m.group(pent.group_prefix + "1"), test_num) self.assertEqual(m.group(pent.group_prefix + "2"), test_line_end) + @ut.skip("Developing without optional/zero-or-more for now") + def test_optional_str(self): + """Confirm single optional str token works as expected.""" + import pent + + test_string = "This is a test {} string." + test_pat = "~ @.test @{}?foo @x.string ~" + + for there, cap in itt.product(*itt.repeat((True, False), 2)): + with self.subTest("There: {0}, Cap: {1}".format(there, cap)): + pat = test_pat.format(pent.parser._s_capture if cap else "") + prs_pat = pent.Parser().convert_line(pat) + + work_str = test_string.format("foo" if there else "") + + m = re.search(prs_pat, work_str) + + self.assertIsNotNone(m) + if cap: + if there: + self.assertEqual( + "foo", + m.group(pent.group_prefix + "0"), + msg=work_str + pat, + ) + else: + self.assertEqual("", m.group(pent.group_prefix + "0")) + else: + self.assertRaises( + IndexError, m.group, pent.group_prefix + "0" + ) + + def test_one_or_more_str(self): + """Confirm one-or-more str token works as expected.""" + import pent + + test_string = "This is a test {} string." 
+ test_pat = "~ @{}+foo ~" + + for qty, cap in itt.product((1, 2, 3), (True, False)): + with self.subTest("Qty: {0}, Cap: {1}".format(qty, cap)): + pat = test_pat.format(pent.parser._s_capture if cap else "") + pat = pent.Parser().convert_line(pat) + + work_str = test_string.format("foo" * qty) + + m = re.search(pat, work_str) + + self.assertIsNotNone(m) + if cap: + self.assertEqual( + "foo" * qty, m.group(pent.group_prefix + "0") + ) + else: + self.assertRaises( + IndexError, m.group, pent.group_prefix + "0" + ) + + @ut.skip("Skipping until resolve Optional.") + def test_zero_or_more_str(self): + """Confirm zero-or-more str token works as expected.""" + import pent + + test_string = "This is a test {}string." + test_pat = "~ @{}*foo ~" + + for qty, cap in itt.product((0, 1, 2, 3), (True, False)): + with self.subTest("Qty: {0}, Cap: {1}".format(qty, cap)): + pat = test_pat.format(pent.parser._s_capture if cap else "") + pat = pent.Parser().convert_line(pat) + + work_str = test_string.format("foo " * qty) + + m = re.search(pat, work_str) + + self.assertIsNotNone(m) + if cap: + self.assertEqual( + "foo " * qty, m.group(pent.group_prefix + "0") + ) + else: + self.assertRaises( + IndexError, m.group, pent.group_prefix + "0" + ) + + @ut.skip("Developing w/o optional/zero-or-more for now") + def test_one_or_more_doesnt_match_zero_reps(self): + """Confirm one-or-more str doesn't match if string isn't there.""" + import pent + + test_string = "This is a test string." 
+ test_pat = "~ @.is @!?absolutely @.a ~" + + m = re.search(self.prs.convert_line(test_pat), test_string) + + self.assertEqual("", m.group(pent.group_prefix + "0")) + def test_manual_two_lines(self): """Run manual check on concatenating two single-line regexes.""" test_str = "This is line one: 12345 \nAnd this is line two: -3e-5" From 56a9ea351da6fcb822d5a5b9f002bd6311d56c49 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 17 Sep 2018 00:05:15 -0400 Subject: [PATCH 32/44] DEV: Add ability to adjust starting group_id In cases where multiple lines are needed in a "line" repeating pattern and group capture will occur, the ability to restart group id inlay at a custom value will be critical. Test is in place; functionality seems to be working, at least on a basic level. --- pent/parser.py | 8 ++++---- pent/test/pent_base.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pent/parser.py b/pent/parser.py index 121cc52..663414d 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -112,13 +112,16 @@ class Parser: """Mini-language parser for structured numerical data.""" @classmethod - def convert_line(cls, line, *, capture_groups=True): + def convert_line(cls, line, *, capture_groups=True, group_id=0): """Convert line of tokens to regex. The constructed regex is required to match the entirety of a line of text, using lookbehind and lookahead at the start and end of the pattern, respectively. + `group_id` indicates the starting value of the index for any + capture groups added. 
+ """ import shlex @@ -132,9 +135,6 @@ def convert_line(cls, line, *, capture_groups=True): # Always have optional starting whitespace pattern += r"[ \t]*" - # Must initialize - group_id = 0 - # Initialize flag for a preceding no-space-after num token prior_no_space_token = False diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index b4d8984..ebda813 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -425,17 +425,23 @@ def test_one_or_more_doesnt_match_zero_reps(self): def test_manual_two_lines(self): """Run manual check on concatenating two single-line regexes.""" + import pent + test_str = "This is line one: 12345 \nAnd this is line two: -3e-5" - test_pat_1 = "~ @.one: #.+i" - test_pat_2 = "~ @.two: #.-s" + test_pat_1 = "~ @!.one: #!.+i" + test_pat_2 = "~ @!.two: #!.-s" cp_1 = self.prs.convert_line(test_pat_1) - cp_2 = self.prs.convert_line(test_pat_2) + cp_2 = self.prs.convert_line(test_pat_2, group_id=2) m = re.search(cp_1 + r"\n" + cp_2, test_str) self.assertIsNotNone(m) + self.assertEqual("one:", m.group(pent.group_prefix + "0")) + self.assertEqual("12345", m.group(pent.group_prefix + "1")) + self.assertEqual("two:", m.group(pent.group_prefix + "2")) + self.assertEqual("-3e-5", m.group(pent.group_prefix + "3")) class TestPentTokens(ut.TestCase, SuperPent): From 2c547c7df571140a3abf5981358a104942fcb56c Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 17 Sep 2018 12:32:44 -0400 Subject: [PATCH 33/44] DEV/TEST: Add OneOrMore number; add tests Basic one-or-more number token behavior seems to be working ok. Simple tests in place. Add test for literal token with contained space. Also seems working ok. This and prior work closes #16. 
--- pent/parser.py | 4 +++ pent/test/pent_base.py | 70 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/pent/parser.py b/pent/parser.py index 663414d..3c02bb5 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -278,6 +278,10 @@ def __attrs_post_init__(self): elif self.is_num: self._pattern = self._get_number_pattern(self._pr) + + if self.match_quantity is Quantity.OneOrMore: + self._pattern += r"([ \t]+{})*".format(self._pattern) + else: # pragma: no cover raise NotImplementedError( "Unknown content type somehow specified!" diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index ebda813..afd35b5 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -145,6 +145,7 @@ def test_string_capture(self): test_pat_capture = "~ @!.word ~" test_pat_ignore = "~ @.word ~" test_pat_symbol = "~ @!.[symbol] ~" + test_pat_with_space = "~ '@!.string with' ~" with self.subTest("capture"): pat = self.prs.convert_line(test_pat_capture) @@ -164,6 +165,12 @@ def test_string_capture(self): self.assertIsNotNone(m) self.assertEqual(m.group(pent.group_prefix + "0"), "[symbol]") + with self.subTest("with_space"): + pat = self.prs.convert_line(test_pat_with_space) + m = re.search(pat, test_line) + self.assertIsNotNone(m) + self.assertEqual(m.group(pent.group_prefix + "0"), "string with") + def test_single_num_capture(self): """Confirm single-number capture works.""" import pent @@ -358,8 +365,8 @@ def test_optional_str(self): IndexError, m.group, pent.group_prefix + "0" ) - def test_one_or_more_str(self): - """Confirm one-or-more str token works as expected.""" + def test_one_or_more_str_nospace(self): + """Confirm one-or-more str token works as expected w/no space.""" import pent test_string = "This is a test {} string." 
@@ -384,6 +391,32 @@ def test_one_or_more_str(self): IndexError, m.group, pent.group_prefix + "0" ) + def test_one_or_more_str_with_space(self): + """Confirm one-or-more str token works as expected w/space.""" + import pent + + test_string = "This is a test {}string." + test_pat = "~ '@x{}+foo ' ~" + + for qty, cap in itt.product((1, 2, 3), (True, False)): + with self.subTest("Qty: {0}, Cap: {1}".format(qty, cap)): + pat = test_pat.format(pent.parser._s_capture if cap else "") + pat = pent.Parser().convert_line(pat) + + work_str = test_string.format("foo " * qty) + + m = re.search(pat, work_str) + + self.assertIsNotNone(m) + if cap: + self.assertEqual( + "foo " * qty, m.group(pent.group_prefix + "0") + ) + else: + self.assertRaises( + IndexError, m.group, pent.group_prefix + "0" + ) + @ut.skip("Skipping until resolve Optional.") def test_zero_or_more_str(self): """Confirm zero-or-more str token works as expected.""" @@ -443,6 +476,33 @@ def test_manual_two_lines(self): self.assertEqual("two:", m.group(pent.group_prefix + "2")) self.assertEqual("-3e-5", m.group(pent.group_prefix + "3")) + def test_quick_one_or_more_number(self): + """Run quick check on capture of one-or-more number token.""" + import pent + + numbers = "2 5 -54 3.8 -1.e-12" + + test_str = "This has numbers {} with end space.".format(numbers) + test_str_period = "This has numbers {}.".format(numbers) + + test_pat = "~ #!+.g ~" + test_pat_period = "~ #x!+.g @.." 
+ + re_pat = self.prs.convert_line(test_pat) + re_pat_period = self.prs.convert_line(test_pat_period) + + with self.subTest("end_space"): + m_pat = re.search(re_pat, test_str) + self.assertIsNotNone(m_pat) + self.assertEqual(m_pat.group(pent.group_prefix + "0"), numbers) + + with self.subTest("period"): + m_pat_period = re.search(re_pat_period, test_str_period) + self.assertIsNotNone(m_pat_period) + self.assertEqual( + m_pat_period.group(pent.group_prefix + "0"), numbers + ) + class TestPentTokens(ut.TestCase, SuperPent): """Direct tests on the Token class.""" @@ -503,6 +563,12 @@ def test_sign_property(self): with self.subTest("any"): self.assertEqual(pent.Token("~").sign, None) + def test_qty_property_on_any(self): + """Ensure t.match_quantity property returns correct value on 'any'.""" + import pent + + self.assertEqual(pent.Token("~").match_quantity, None) + class TestPentParserPatternsSlow(ut.TestCase, SuperPent): """SLOW tests confirming pattern matching of Parser regexes.""" From 97815e0d4137aadc5e8d1412cd6efefda348db32 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Mon, 17 Sep 2018 14:12:33 -0400 Subject: [PATCH 34/44] REFACTOR: Pull pyparsing patterns into Token No reason for it to be housed at the module level. Closes #20. 
--- pent/__init__.py | 3 +- pent/parser.py | 165 ++++++++++++++++++++--------------------- pent/test/pent_base.py | 112 ++++++++++++++++------------ 3 files changed, 149 insertions(+), 131 deletions(-) diff --git a/pent/__init__.py b/pent/__init__.py index fba5cfd..af079c4 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -38,7 +38,6 @@ "number_patterns", "wordify_pattern", "std_wordify", - "group_prefix", "PentError", "BadTokenError", ] @@ -46,7 +45,7 @@ from .enums import Number, Sign, TokenField from .enums import Content, Quantity from .errors import PentError, BadTokenError -from .parser import Parser, Token, group_prefix +from .parser import Parser, Token from .patterns import number_patterns, wordify_pattern, std_wordify diff --git a/pent/parser.py b/pent/parser.py index 3c02bb5..e91f3fa 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -33,80 +33,6 @@ from .patterns import std_wordify_open, std_wordify_close -# ## MINI-LANGUAGE PARSER DEFINITION ## - -# ## HELPERS ## -group_prefix = "g" -_s_any_flag = "~" -_s_capture = "!" -_s_no_space = "x" - -_pp_no_space = pp.Optional(pp.Literal(_s_no_space)).setResultsName( - TokenField.NoSpace -) -_pp_capture = pp.Optional(pp.Literal(_s_capture)).setResultsName( - TokenField.Capture -) -_pp_quantity = pp.Word("".join(Quantity), exact=1).setResultsName( - TokenField.Quantity -) - - -# ## ARBITRARY CONTENT ## -# Tilde says anything may be here, including multiple words -# Definitely want to give the option not to capture. Might ideally -# be the default NOT to capture here... 
-_pp_any_flag = ( - pp.Literal(_s_any_flag).setResultsName(TokenField.Type) + _pp_capture -) - -# ## LITERAL STRING ## -# Marker for the rest of the token to be a literal string -_pp_str_flag = pp.Literal(Content.String.value).setResultsName(TokenField.Type) - -# Remainder of the content after the marker, spaces included -_pp_str_value = pp.Word(pp.printables + " ").setResultsName(TokenField.Str) - -# Composite pattern for a literal string -_pp_string = ( - _pp_str_flag + _pp_no_space + _pp_capture + _pp_quantity + _pp_str_value -) - -# ## NUMERICAL VALUE ## -# Initial marker for a numerical value -_pp_num_flag = pp.Literal(Content.Number.value).setResultsName(TokenField.Type) - -# Marker for the sign of the value; period indicates either sign -_pp_num_sign = pp.Word("".join(Sign), exact=1).setResultsName(TokenField.Sign) - -# Marker for the number type to look for -_pp_num_type = pp.Word("".join(Number), exact=1).setResultsName( - TokenField.Number -) - -# Composite pattern for a number -_pp_number = ( - _pp_num_flag - + _pp_no_space - + _pp_capture - + _pp_quantity - + pp.Group(_pp_num_sign + _pp_num_type).setResultsName( - TokenField.SignNumber - ) -) - - -# ## COMBINED TOKEN PARSER ## -_pp_token = ( - pp.StringStart() - + (_pp_any_flag ^ _pp_string ^ _pp_number) - + pp.StringEnd() -) - - -# ## PARSER CLASS FOR EXTERNAL USE ## - - @attr.s(slots=True) class Parser: """Mini-language parser for structured numerical data.""" @@ -188,10 +114,87 @@ class Token: #: Flag for whether group ID substitution needs to be done needs_group_id = attr.ib(default=False, init=False, repr=False) - # Internal pyparsing result and generated regex pattern + # Internal pyparsing parse result and generated regex pattern _pr = attr.ib(default=None, init=False, repr=False) _pattern = attr.ib(default=None, init=False, repr=False) + # ##### pyparsing pattern internals ##### + + # ## MINOR PATTERN COMPONENTS ## + group_prefix = "g" + _s_any_flag = "~" + _s_capture = "!" 
+ _s_no_space = "x" + + _pp_no_space = pp.Optional(pp.Literal(_s_no_space)).setResultsName( + TokenField.NoSpace + ) + _pp_capture = pp.Optional(pp.Literal(_s_capture)).setResultsName( + TokenField.Capture + ) + _pp_quantity = pp.Word("".join(Quantity), exact=1).setResultsName( + TokenField.Quantity + ) + + # ## ARBITRARY CONTENT TOKEN ## + # Anything may be matched here, including multiple words. + _pp_any_flag = ( + pp.Literal(_s_any_flag).setResultsName(TokenField.Type) + _pp_capture + ) + + # ## LITERAL STRING TOKEN ## + # Marker for the rest of the token to be a literal string + _pp_str_flag = pp.Literal(Content.String.value).setResultsName( + TokenField.Type + ) + + # Remainder of the content after the marker, spaces included + _pp_str_value = pp.Word(pp.printables + " ").setResultsName(TokenField.Str) + + # Composite pattern for a literal string + _pp_string = ( + _pp_str_flag + + _pp_no_space + + _pp_capture + + _pp_quantity + + _pp_str_value + ) + + # ## NUMERICAL VALUE TOKEN ## + # Initial marker for a numerical value + _pp_num_flag = pp.Literal(Content.Number.value).setResultsName( + TokenField.Type + ) + + # Marker for the sign of the value; period indicates either sign + _pp_num_sign = pp.Word("".join(Sign), exact=1).setResultsName( + TokenField.Sign + ) + + # Marker for the number type to look for + _pp_num_type = pp.Word("".join(Number), exact=1).setResultsName( + TokenField.Number + ) + + # Composite pattern for a number + _pp_number = ( + _pp_num_flag + + _pp_no_space + + _pp_capture + + _pp_quantity + + pp.Group(_pp_num_sign + _pp_num_type).setResultsName( + TokenField.SignNumber + ) + ) + + # ## COMBINED TOKEN PARSER ## + _pp_token = ( + pp.StringStart() + + (_pp_any_flag ^ _pp_string ^ _pp_number) + + pp.StringEnd() + ) + + # Informational properties @property def pattern(self): """Return assembled regex pattern from the token, as |str|.""" @@ -252,7 +255,7 @@ def capture(self): def __attrs_post_init__(self): """Handle automatic creation 
stuff.""" try: - self._pr = _pp_token.parseString(self.token) + self._pr = self._pp_token.parseString(self.token) except pp.ParseException as e: raise BadTokenError(self.token) from e @@ -269,12 +272,8 @@ def __attrs_post_init__(self): self._pattern = self._string_pattern(self._pr[TokenField.Str]) # Modify, depending on the Quantity - # ~ if self.match_quantity is Quantity.Optional: - # ~ self._pattern = "(" + self._pattern + ")?" if self.match_quantity is Quantity.OneOrMore: self._pattern = "(" + self._pattern + ")+" - # ~ if self.match_quantity is Quantity.ZeroOrMore: - # ~ self._pattern = "(" + self._pattern + ")*" elif self.is_num: self._pattern = self._get_number_pattern(self._pr) @@ -313,15 +312,15 @@ def _get_number_pattern(cls, parse_result): return cls._numpats[num, sign] - @staticmethod - def _group_open(): + @classmethod + def _group_open(cls): """Create the opening pattern for a named group. This leaves a formatting placeholder for the invoking Parser to inject the appropriate group ID. 
""" - return r"(?P<{0}{{0}}>".format(group_prefix) + return r"(?P<{0}{{0}}>".format(cls.group_prefix) @staticmethod def _group_close(): diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index afd35b5..b378734 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -151,25 +151,31 @@ def test_string_capture(self): pat = self.prs.convert_line(test_pat_capture) m = re.search(pat, test_line) self.assertIsNotNone(m) - self.assertEqual(m.group(pent.group_prefix + "0"), "word") + self.assertEqual(m.group(pent.Token.group_prefix + "0"), "word") with self.subTest("ignore"): pat = self.prs.convert_line(test_pat_ignore) m = re.search(pat, test_line) self.assertIsNotNone(m) - self.assertRaises(IndexError, m.group, pent.group_prefix + "0") + self.assertRaises( + IndexError, m.group, pent.Token.group_prefix + "0" + ) with self.subTest("symbol"): pat = self.prs.convert_line(test_pat_symbol) m = re.search(pat, test_line) self.assertIsNotNone(m) - self.assertEqual(m.group(pent.group_prefix + "0"), "[symbol]") + self.assertEqual( + m.group(pent.Token.group_prefix + "0"), "[symbol]" + ) with self.subTest("with_space"): pat = self.prs.convert_line(test_pat_with_space) m = re.search(pat, test_line) self.assertIsNotNone(m) - self.assertEqual(m.group(pent.group_prefix + "0"), "string with") + self.assertEqual( + m.group(pent.Token.group_prefix + "0"), "string with" + ) def test_single_num_capture(self): """Confirm single-number capture works.""" @@ -196,7 +202,9 @@ def test_single_num_capture(self): ) if m is not None: - self.assertEqual(m.group(pent.group_prefix + "0"), v) + self.assertEqual( + m.group(pent.Token.group_prefix + "0"), v + ) def test_single_nums_no_space(self): """Confirm two-number capture works, with no intervening space. 
@@ -215,8 +223,8 @@ def test_single_nums_no_space(self): m = re.search(npat, test_str) self.assertIsNotNone(m) - self.assertEqual(m.group(pent.group_prefix + "0"), "123") - self.assertEqual(m.group(pent.group_prefix + "1"), "-456") + self.assertEqual(m.group(pent.Token.group_prefix + "0"), "123") + self.assertEqual(m.group(pent.Token.group_prefix + "1"), "-456") def test_single_num_preceding_colon_capture(self): """Confirm single-number capture works, with preceding colon.""" @@ -243,7 +251,9 @@ def test_single_num_preceding_colon_capture(self): ) if m is not None: - self.assertEqual(m.group(pent.group_prefix + "0"), v) + self.assertEqual( + m.group(pent.Token.group_prefix + "0"), v + ) def test_string_and_single_num_capture(self): """Confirm multiple capture of string and single number.""" @@ -271,9 +281,11 @@ def test_string_and_single_num_capture(self): if m is not None: self.assertEqual( - m.group(pent.group_prefix + "0"), "string" + m.group(pent.Token.group_prefix + "0"), "string" + ) + self.assertEqual( + m.group(pent.Token.group_prefix + "1"), v ) - self.assertEqual(m.group(pent.group_prefix + "1"), v) def test_number_ending_sentence(self): """Check that a number at the end of a sentence matches correctly.""" @@ -286,8 +298,8 @@ def test_number_ending_sentence(self): for n in npats: token = npats[n].format( - pent.parser._s_no_space, - pent.parser._s_capture, + pent.Token._s_no_space, + pent.Token._s_capture, pent.Quantity.Single, ) with self.subTest(token): @@ -295,7 +307,7 @@ def test_number_ending_sentence(self): m = re.search(pat, test_line.format(n)) self.assertIsNotNone(m, msg=test_line.format(n) + token) - self.assertEqual(n, m.group(pent.group_prefix + "0")) + self.assertEqual(n, m.group(pent.Token.group_prefix + "0")) def test_match_entire_line(self): """Confirm the tilde works to match an entire line.""" @@ -308,14 +320,16 @@ def test_match_entire_line(self): self.assertTrue(self.does_parse_match(pat, test_line)) m = re.search(pat, test_line) - 
self.assertEqual(test_line, m.group(pent.group_prefix + "0")) + self.assertEqual(test_line, m.group(pent.Token.group_prefix + "0")) with self.subTest("no_capture"): pat = self.prs.convert_line("~") self.assertTrue(self.does_parse_match(pat, test_line)) m = re.search(pat, test_line) - self.assertRaises(IndexError, m.group, pent.group_prefix + "0") + self.assertRaises( + IndexError, m.group, pent.Token.group_prefix + "0" + ) def test_any_token_capture_ranges(self): """Confirm 'any' captures work as expected with other tokens.""" @@ -329,9 +343,11 @@ def test_any_token_capture_ranges(self): pat = pent.Parser().convert_line("~! @x.[ #x!..g @x.] ~!") m = re.search(pat, test_line) - self.assertEqual(m.group(pent.group_prefix + "0"), test_line_start) - self.assertEqual(m.group(pent.group_prefix + "1"), test_num) - self.assertEqual(m.group(pent.group_prefix + "2"), test_line_end) + self.assertEqual( + m.group(pent.Token.group_prefix + "0"), test_line_start + ) + self.assertEqual(m.group(pent.Token.group_prefix + "1"), test_num) + self.assertEqual(m.group(pent.Token.group_prefix + "2"), test_line_end) @ut.skip("Developing without optional/zero-or-more for now") def test_optional_str(self): @@ -343,7 +359,7 @@ def test_optional_str(self): for there, cap in itt.product(*itt.repeat((True, False), 2)): with self.subTest("There: {0}, Cap: {1}".format(there, cap)): - pat = test_pat.format(pent.parser._s_capture if cap else "") + pat = test_pat.format(pent.Token._s_capture if cap else "") prs_pat = pent.Parser().convert_line(pat) work_str = test_string.format("foo" if there else "") @@ -355,14 +371,16 @@ def test_optional_str(self): if there: self.assertEqual( "foo", - m.group(pent.group_prefix + "0"), + m.group(pent.Token.group_prefix + "0"), msg=work_str + pat, ) else: - self.assertEqual("", m.group(pent.group_prefix + "0")) + self.assertEqual( + "", m.group(pent.Token.group_prefix + "0") + ) else: self.assertRaises( - IndexError, m.group, pent.group_prefix + "0" + IndexError, 
m.group, pent.Token.group_prefix + "0" ) def test_one_or_more_str_nospace(self): @@ -374,7 +392,7 @@ def test_one_or_more_str_nospace(self): for qty, cap in itt.product((1, 2, 3), (True, False)): with self.subTest("Qty: {0}, Cap: {1}".format(qty, cap)): - pat = test_pat.format(pent.parser._s_capture if cap else "") + pat = test_pat.format(pent.Token._s_capture if cap else "") pat = pent.Parser().convert_line(pat) work_str = test_string.format("foo" * qty) @@ -384,11 +402,11 @@ def test_one_or_more_str_nospace(self): self.assertIsNotNone(m) if cap: self.assertEqual( - "foo" * qty, m.group(pent.group_prefix + "0") + "foo" * qty, m.group(pent.Token.group_prefix + "0") ) else: self.assertRaises( - IndexError, m.group, pent.group_prefix + "0" + IndexError, m.group, pent.Token.group_prefix + "0" ) def test_one_or_more_str_with_space(self): @@ -400,7 +418,7 @@ def test_one_or_more_str_with_space(self): for qty, cap in itt.product((1, 2, 3), (True, False)): with self.subTest("Qty: {0}, Cap: {1}".format(qty, cap)): - pat = test_pat.format(pent.parser._s_capture if cap else "") + pat = test_pat.format(pent.Token._s_capture if cap else "") pat = pent.Parser().convert_line(pat) work_str = test_string.format("foo " * qty) @@ -410,11 +428,11 @@ def test_one_or_more_str_with_space(self): self.assertIsNotNone(m) if cap: self.assertEqual( - "foo " * qty, m.group(pent.group_prefix + "0") + "foo " * qty, m.group(pent.Token.group_prefix + "0") ) else: self.assertRaises( - IndexError, m.group, pent.group_prefix + "0" + IndexError, m.group, pent.Token.group_prefix + "0" ) @ut.skip("Skipping until resolve Optional.") @@ -427,7 +445,7 @@ def test_zero_or_more_str(self): for qty, cap in itt.product((0, 1, 2, 3), (True, False)): with self.subTest("Qty: {0}, Cap: {1}".format(qty, cap)): - pat = test_pat.format(pent.parser._s_capture if cap else "") + pat = test_pat.format(pent.Token._s_capture if cap else "") pat = pent.Parser().convert_line(pat) work_str = test_string.format("foo " * qty) 
@@ -437,11 +455,11 @@ def test_zero_or_more_str(self): self.assertIsNotNone(m) if cap: self.assertEqual( - "foo " * qty, m.group(pent.group_prefix + "0") + "foo " * qty, m.group(pent.Token.group_prefix + "0") ) else: self.assertRaises( - IndexError, m.group, pent.group_prefix + "0" + IndexError, m.group, pent.Token.group_prefix + "0" ) @ut.skip("Developing w/o optional/zero-or-more for now") @@ -454,7 +472,7 @@ def test_one_or_more_doesnt_match_zero_reps(self): m = re.search(self.prs.convert_line(test_pat), test_string) - self.assertEqual("", m.group(pent.group_prefix + "0")) + self.assertEqual("", m.group(pent.Token.group_prefix + "0")) def test_manual_two_lines(self): """Run manual check on concatenating two single-line regexes.""" @@ -471,10 +489,10 @@ def test_manual_two_lines(self): m = re.search(cp_1 + r"\n" + cp_2, test_str) self.assertIsNotNone(m) - self.assertEqual("one:", m.group(pent.group_prefix + "0")) - self.assertEqual("12345", m.group(pent.group_prefix + "1")) - self.assertEqual("two:", m.group(pent.group_prefix + "2")) - self.assertEqual("-3e-5", m.group(pent.group_prefix + "3")) + self.assertEqual("one:", m.group(pent.Token.group_prefix + "0")) + self.assertEqual("12345", m.group(pent.Token.group_prefix + "1")) + self.assertEqual("two:", m.group(pent.Token.group_prefix + "2")) + self.assertEqual("-3e-5", m.group(pent.Token.group_prefix + "3")) def test_quick_one_or_more_number(self): """Run quick check on capture of one-or-more number token.""" @@ -494,13 +512,15 @@ def test_quick_one_or_more_number(self): with self.subTest("end_space"): m_pat = re.search(re_pat, test_str) self.assertIsNotNone(m_pat) - self.assertEqual(m_pat.group(pent.group_prefix + "0"), numbers) + self.assertEqual( + m_pat.group(pent.Token.group_prefix + "0"), numbers + ) with self.subTest("period"): m_pat_period = re.search(re_pat_period, test_str_period) self.assertIsNotNone(m_pat_period) self.assertEqual( - m_pat_period.group(pent.group_prefix + "0"), numbers + 
m_pat_period.group(pent.Token.group_prefix + "0"), numbers ) @@ -609,20 +629,20 @@ def test_three_token_sequence(self): p1 = (str_pat if c1 == pent.Content.String else nps)[ v1 ].format( - pent.parser._s_no_space if not s1 else "", - pent.parser._s_capture, + pent.Token._s_no_space if not s1 else "", + pent.Token._s_capture, pent.Quantity.Single, ) p2 = (str_pat if c2 == pent.Content.String else nps)[ v2 ].format( - pent.parser._s_no_space if not s2 else "", - pent.parser._s_capture, + pent.Token._s_no_space if not s2 else "", + pent.Token._s_capture, pent.Quantity.Single, ) p3 = (str_pat if c3 == pent.Content.String else nps)[ v3 - ].format("", pent.parser._s_capture, pent.Quantity.Single) + ].format("", pent.Token._s_capture, pent.Quantity.Single) test_pat = pat_template.format(p1, p2, p3) test_str = str_template.format( @@ -638,17 +658,17 @@ def test_three_token_sequence(self): self.assertIsNotNone(m, msg=test_pat) self.assertEqual( - m.group(pent.group_prefix + "0"), + m.group(pent.Token.group_prefix + "0"), v1, msg=test_pat + " :: " + test_str, ) self.assertEqual( - m.group(pent.group_prefix + "1"), + m.group(pent.Token.group_prefix + "1"), v2, msg=test_pat + " :: " + test_str, ) self.assertEqual( - m.group(pent.group_prefix + "2"), + m.group(pent.Token.group_prefix + "2"), v3, msg=test_pat + " :: " + test_str, ) From 05da68c54564b8e89e19a3bc1e8bfc456f04f94d Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Wed, 19 Sep 2018 01:07:45 -0400 Subject: [PATCH 35/44] TDD: Add 1-D parser test; partial implement Add an example ORCA .hess file, for use in testing the higher-level Parser functionality. Modify Parser.convert_line to also return the value of the group_id, as it will be necessary to increment and re-pass this value when building with-capture patterns for head/tail containing multiple lines. (This necessitates [0]-indexing the return value from .convert_line if all you care about is the regex pattern.) 
Implement a first pass on the total-pattern construction @property of Parser. Added test seems to confirm that the 1-D construction works properly. Remains to be seen if the 2-D construction also works (#34). Should possibly try to develop at least a toy 3-D dataset and see how it parses (#35). For both 1-D and 2-D (and generalized 3-D .. n-D), need to implement the structured data retrieval and cross-check against sample data (#36). Lots of sample data. --- pent/errors.py | 12 +++ pent/parser.py | 69 ++++++++++++- pent/test/C2F4_01.hess | 217 +++++++++++++++++++++++++++++++++++++++++ pent/test/pent_base.py | 81 ++++++++++----- pent/test/testdata.py | 30 ++++++ 5 files changed, 383 insertions(+), 26 deletions(-) create mode 100644 pent/test/C2F4_01.hess diff --git a/pent/errors.py b/pent/errors.py index 3d5ebba..6421fc8 100644 --- a/pent/errors.py +++ b/pent/errors.py @@ -43,5 +43,17 @@ def __str__(self): return "'{}' is an invalid pent token".format(self.token) +class BadSectionError(PentError): # pragma: no cover + """Raised from failed attempts to parse a Parser section.""" + + def __init__(self, msg=""): + """Instantiate a ``BadSectionError``.""" + self.msg = msg + + def __str__(self): + """Generate a more-informative error message.""" + return "Bad Parser section: {}".format(self.msg) + + if __name__ == "__main__": # pragma: no cover print("Module not executable.") diff --git a/pent/parser.py b/pent/parser.py index e91f3fa..a49e5d4 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -29,7 +29,7 @@ from .enums import Number, Sign, TokenField from .enums import Content, Quantity -from .errors import BadTokenError +from .errors import BadTokenError, BadSectionError from .patterns import std_wordify_open, std_wordify_close @@ -37,6 +37,71 @@ class Parser: """Mini-language parser for structured numerical data.""" + head = attr.ib(default=None) + body = attr.ib(default=None) + tail = attr.ib(default=None) + + @property + def pattern(self): + """Return the regex 
pattern for the entire parser. + + The capture groups are NEVER inserted when regex is + generated this way. + + """ + # Relies on the convert_section default for 'capture_groups' + # as False. + rx_head, rx_body, rx_tail = map( + self.convert_section, (self.head, self.body, self.tail) + ) + + rx = "" + + if rx_head: + rx += rx_head + "\n" + + try: + # At least one line of the body, followed by however many more + rx += rx_body + "(\n" + rx_body + ")*" + except TypeError as e: + raise BadSectionError( + "'body' required to generate 'pattern'" + ) from e + + if rx_tail: + rx += "\n" + rx_tail + + return rx + + @classmethod + def convert_section(cls, sec, capture_groups=False): + """Convert the head, body or tail to regex.""" + # Could be None + if sec is None: + return None + + # If it's a Parser + try: + return sec.pattern + except AttributeError: + pass + + # If it's a single line + try: + return cls.convert_line(sec, capture_groups=capture_groups)[0] + except AttributeError: + pass + + # If it's an iterable of lines + try: + return "\n".join( + cls.convert_line(_, capture_groups=False)[0] for _ in sec + ) + except AttributeError: + # Most likely is that the iterable members don't have + # the .pattern attribute + raise BadSectionError("Unrecognized format") + @classmethod def convert_line(cls, line, *, capture_groups=True, group_id=0): """Convert line of tokens to regex. 
@@ -96,7 +161,7 @@ def convert_line(cls, line, *, capture_groups=True, group_id=0): # Always put possible whitespace to the end of the line pattern += r"[ \t]*($|(?=\n))" - return pattern + return pattern, group_id @attr.s(slots=True) diff --git a/pent/test/C2F4_01.hess b/pent/test/C2F4_01.hess new file mode 100644 index 0000000..4950774 --- /dev/null +++ b/pent/test/C2F4_01.hess @@ -0,0 +1,217 @@ + +$orca_hessian_file + +$act_atom + 0 + +$act_coord + 0 + +$act_energy + 0.000000 + +$hessian +18 + 0 1 2 3 4 5 + 0 0.468819 -0.006771 0.020586 -0.382690 0.017874 -0.054490 + 1 -0.006719 0.022602 -0.016183 0.010997 -0.033397 0.014422 + 2 0.020559 -0.016184 0.066859 -0.033601 0.014417 -0.072836 + 3 -0.383124 0.011028 -0.033603 0.713881 -0.050342 0.153741 + 4 0.017855 -0.033507 0.014435 -0.050333 0.166681 -0.235792 + 5 -0.054469 0.014441 -0.072841 0.153750 -0.235799 0.809241 + 6 -0.044553 -0.015013 0.045825 -0.106635 -0.034806 0.106260 + 7 -0.010928 0.003343 0.001126 -0.028196 -0.059985 0.096280 + 8 0.033360 0.001126 0.000270 0.085486 0.095883 -0.322046 + 9 -0.037205 0.008796 -0.026874 -0.187901 0.070933 -0.216625 + 10 -0.005997 0.001355 -0.001918 0.070927 -0.071900 0.117668 + 11 0.018298 -0.001928 0.006587 -0.216636 0.117683 -0.392551 + 12 -0.015040 0.001637 -0.004988 -0.037205 -0.005998 0.018298 + 13 0.001637 0.008854 0.004440 0.008796 0.001355 -0.001928 + 14 -0.004988 0.004440 -0.003248 -0.026874 -0.001918 0.006587 + 15 0.010667 0.000300 -0.000920 0.001123 0.002441 -0.007439 + 16 0.004184 -0.002756 -0.001880 -0.012362 -0.002357 0.009410 + 17 -0.012766 -0.001878 0.002370 0.037730 0.009415 -0.028028 + 6 7 8 9 10 11 + 0 -0.044552 -0.010926 0.033360 -0.037204 -0.005998 0.018298 + 1 -0.015010 0.003340 0.001128 0.008796 0.001355 -0.001928 + 2 0.045825 0.001126 0.000268 -0.026874 -0.001918 0.006587 + 3 -0.106773 -0.028053 0.085632 -0.187901 0.070933 -0.216624 + 4 -0.034894 -0.060273 0.096192 0.070927 -0.071900 0.117668 + 5 0.106497 0.096194 -0.322409 -0.216636 0.117683 
-0.392551 + 6 0.142030 0.047505 -0.145027 0.001122 0.002441 -0.007439 + 7 0.047730 0.054104 -0.113046 -0.012362 -0.002357 0.009410 + 8 -0.145110 -0.112659 0.361888 0.037730 0.009415 -0.028028 + 9 0.001122 -0.012362 0.037730 0.713879 -0.050342 0.153740 + 10 0.002441 -0.002357 0.009415 -0.050332 0.166681 -0.235793 + 11 -0.007439 0.009410 -0.028028 0.153749 -0.235800 0.809242 + 12 0.010667 0.004184 -0.012766 -0.382689 0.017873 -0.054489 + 13 0.000300 -0.002755 -0.001878 0.010996 -0.033397 0.014421 + 14 -0.000921 -0.001881 0.002370 -0.033600 0.014417 -0.072835 + 15 -0.002632 -0.000428 0.001300 -0.106634 -0.034805 0.106259 + 16 -0.000428 0.007650 0.008112 -0.028196 -0.059985 0.096280 + 17 0.001301 0.008112 -0.014453 0.085485 0.095884 -0.322047 + 12 13 14 15 16 17 + 0 -0.015040 0.001637 -0.004988 0.010667 0.004184 -0.012766 + 1 0.001637 0.008854 0.004440 0.000300 -0.002755 -0.001878 + 2 -0.004988 0.004440 -0.003248 -0.000920 -0.001881 0.002370 + 3 -0.037205 0.008796 -0.026874 0.001123 -0.012362 0.037729 + 4 -0.005998 0.001355 -0.001918 0.002441 -0.002357 0.009415 + 5 0.018298 -0.001928 0.006587 -0.007439 0.009409 -0.028028 + 6 0.010667 0.000300 -0.000920 -0.002632 -0.000428 0.001301 + 7 0.004184 -0.002756 -0.001880 -0.000428 0.007650 0.008112 + 8 -0.012766 -0.001878 0.002370 0.001300 0.008112 -0.014453 + 9 -0.383123 0.011028 -0.033602 -0.106772 -0.028053 0.085631 + 10 0.017855 -0.033506 0.014435 -0.034893 -0.060272 0.096192 + 11 -0.054467 0.014441 -0.072841 0.106496 0.096194 -0.322411 + 12 0.468818 -0.006770 0.020584 -0.044552 -0.010926 0.033360 + 13 -0.006719 0.022602 -0.016183 -0.015010 0.003340 0.001128 + 14 0.020557 -0.016184 0.066859 0.045825 0.001126 0.000268 + 15 -0.044552 -0.015013 0.045825 0.142029 0.047505 -0.145026 + 16 -0.010928 0.003343 0.001126 0.047730 0.054104 -0.113047 + 17 0.033360 0.001126 0.000269 -0.145110 -0.112660 0.361889 + +$vibrational_frequencies +18 + 0 0.000000 + 1 0.000000 + 2 0.000000 + 3 0.000000 + 4 0.000000 + 5 0.000000 + 6 194.490162 + 
7 198.587114 + 8 389.931897 + 9 402.713910 + 10 538.244274 + 11 542.017838 + 12 548.246738 + 13 800.613516 + 14 1203.096114 + 15 1342.200360 + 16 1349.543713 + 17 1885.157022 + +$normal_modes +18 18 + 0 1 2 3 4 5 + 0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 3 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 4 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 5 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 6 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 7 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 8 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 9 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 10 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 11 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 12 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 13 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 14 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 15 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 16 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 17 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 + 6 7 8 9 10 11 + 0 0.015973 -0.171527 -0.032796 0.000132 0.017709 0.389831 + 1 -0.485423 0.092995 0.154817 -0.194511 0.096798 -0.065773 + 2 -0.116385 -0.427216 -0.472184 -0.063279 0.041313 0.179270 + 3 0.019428 -0.208038 -0.031784 0.000158 0.013663 0.298629 + 4 0.002703 -0.038155 0.015704 0.613517 -0.653783 0.097714 + 5 -0.011363 0.118773 -0.050883 0.200766 -0.222749 -0.156459 + 6 -0.028257 0.303052 -0.422990 -0.000246 0.001098 0.015318 + 7 0.483717 -0.068872 0.082042 -0.193364 0.105754 0.128621 + 8 0.123568 0.352124 -0.250003 -0.063673 0.013971 -0.414226 + 9 0.019428 -0.208037 0.031785 0.000160 -0.013658 -0.298607 + 10 0.002700 -0.038154 -0.015754 0.613525 0.653772 -0.097724 + 11 -0.011363 0.118773 0.050867 0.200772 
0.222755 0.156494 + 12 0.015973 -0.171529 0.032799 0.000135 -0.017700 -0.389796 + 13 -0.485425 0.092993 -0.154801 -0.194521 -0.096793 0.065784 + 14 -0.116384 -0.427212 0.472193 -0.063252 -0.041321 -0.179306 + 15 -0.028256 0.303057 0.422986 -0.000222 -0.001110 -0.015367 + 16 0.483715 -0.068872 -0.082027 -0.193371 -0.105752 -0.128627 + 17 0.123569 0.352123 0.250005 -0.063659 -0.013966 0.414239 + 12 13 14 15 16 17 + 0 -0.308045 -0.481714 -0.266688 0.190233 -0.238363 0.084679 + 1 -0.096767 -0.012185 0.018156 0.003592 0.004871 -0.006810 + 2 0.295460 0.037100 -0.055486 -0.010984 -0.014761 0.020766 + 3 -0.175864 -0.093363 0.304674 -0.560560 0.532760 -0.358174 + 4 0.091172 0.048626 -0.158096 -0.104721 0.099454 0.185833 + 5 -0.278674 -0.147763 0.482800 0.319591 -0.303669 -0.567350 + 6 0.419267 0.261191 0.074065 0.099721 -0.098459 -0.020570 + 7 0.039129 0.126520 0.081796 0.050569 -0.067748 -0.026457 + 8 -0.119298 -0.386372 -0.249752 -0.154336 0.206747 0.080748 + 9 -0.175902 0.093363 0.304685 0.560581 0.532732 0.358172 + 10 0.091188 -0.048627 -0.158094 0.104729 0.099448 -0.185834 + 11 -0.278644 0.147763 0.482794 -0.319614 -0.303651 0.567351 + 12 -0.308092 0.481712 -0.266691 -0.190241 -0.238354 -0.084678 + 13 -0.096764 0.012186 0.018155 -0.003593 0.004871 0.006810 + 14 0.295438 -0.037102 -0.055484 0.010985 -0.014761 -0.020765 + 15 0.419265 -0.261190 0.074063 -0.099726 -0.098453 0.020569 + 16 0.039109 -0.126520 0.081795 -0.050574 -0.067745 0.026457 + 17 -0.119250 0.386375 -0.249750 0.154350 0.206738 -0.080748 + +# +# The atoms: label mass x y z +# +$atoms +6 + F 18.9980 1.895469 -0.009515 0.029153 + C 12.0110 4.368360 -0.039917 0.121590 + F 18.9980 5.446480 0.653585 -1.995331 + C 12.0110 5.657137 -0.708447 2.163161 + F 18.9980 8.130029 -0.738844 2.255584 + F 18.9980 4.579028 -1.401950 4.280087 + +$actual_temperature + 0.000000 + +$dipole_derivatives +18 + -1.041194 0.077425 -0.236446 + 0.021347 -0.053665 0.064080 + -0.064917 0.064030 -0.228121 + 1.334075 0.025554 -0.077999 + 
0.025455 0.179824 -0.349540 + -0.078013 -0.349384 1.132034 + -0.292885 -0.102966 0.314406 + -0.046806 -0.126137 0.285463 + 0.142946 0.285361 -0.903922 + 1.334072 0.025555 -0.078001 + 0.025456 0.179823 -0.349540 + -0.078015 -0.349384 1.132038 + -1.041194 0.077424 -0.236443 + 0.021345 -0.053666 0.064079 + -0.064912 0.064030 -0.228121 + -0.292884 -0.102965 0.314403 + -0.046806 -0.126136 0.285464 + 0.142945 0.285362 -0.903926 + +# +# The IR spectrum +# wavenumber T**2 TX TY TY +# +$ir_spectrum +18 + 0.00 0.0000 0.0000 0.0000 0.0000 + 0.00 0.0000 0.0000 0.0000 0.0000 + 0.00 0.0000 0.0000 0.0000 0.0000 + 0.00 0.0000 0.0000 0.0000 0.0000 + 0.00 0.0000 0.0000 0.0000 0.0000 + 0.00 0.0000 0.0000 0.0000 0.0000 + 194.49 0.0322 0.1538 0.0279 -0.0882 + 198.59 3.8340 -1.6796 -0.3125 0.9568 + 389.93 0.0000 0.0000 -0.0000 -0.0000 + 402.71 0.9119 0.0000 0.9089 0.2929 + 538.24 0.0000 0.0000 -0.0000 0.0000 + 542.02 0.0000 0.0001 -0.0000 0.0001 + 548.25 2.3612 -0.7904 0.4103 -1.2522 + 800.61 0.0000 0.0000 -0.0000 -0.0000 + 1203.10 358.1939 9.7343 -5.0519 15.4245 + 1342.20 0.0000 0.0003 0.0001 -0.0004 + 1349.54 427.8394 17.7383 3.3117 -10.1107 + 1885.16 0.0000 -0.0000 0.0000 0.0000 + + +$end + diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index b378734..ab79b97 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -28,10 +28,15 @@ import itertools as itt +from pathlib import Path import re import unittest as ut +# HELPERS +testdir_path = Path() / "pent" / "test" + + class SuperPent: """Superclass of various test classes, with common methods.""" @@ -108,7 +113,7 @@ def test_group_tags_or_not(self): test_name = "{0}_{1}".format(content, capture) with self.subTest(test_name): test_pat = patterns[content].format("!" 
if capture else "") - test_rx = self.prs.convert_line(test_pat) + test_rx = self.prs.convert_line(test_pat)[0] self.assertEqual(capture, "(?P<" in test_rx, msg=test_pat) def test_parser_single_line_space_delim(self): @@ -131,7 +136,7 @@ def test_parser_single_line_space_delim(self): test_pat = test_pat_template.format(s.value, n.value) with self.subTest(self.make_testname(v, n, s)): - npat = self.prs.convert_line(test_pat) + npat = self.prs.convert_line(test_pat)[0] res = self.does_parse_match(npat, test_str) @@ -148,13 +153,13 @@ def test_string_capture(self): test_pat_with_space = "~ '@!.string with' ~" with self.subTest("capture"): - pat = self.prs.convert_line(test_pat_capture) + pat = self.prs.convert_line(test_pat_capture)[0] m = re.search(pat, test_line) self.assertIsNotNone(m) self.assertEqual(m.group(pent.Token.group_prefix + "0"), "word") with self.subTest("ignore"): - pat = self.prs.convert_line(test_pat_ignore) + pat = self.prs.convert_line(test_pat_ignore)[0] m = re.search(pat, test_line) self.assertIsNotNone(m) self.assertRaises( @@ -162,7 +167,7 @@ def test_string_capture(self): ) with self.subTest("symbol"): - pat = self.prs.convert_line(test_pat_symbol) + pat = self.prs.convert_line(test_pat_symbol)[0] m = re.search(pat, test_line) self.assertIsNotNone(m) self.assertEqual( @@ -170,7 +175,7 @@ def test_string_capture(self): ) with self.subTest("with_space"): - pat = self.prs.convert_line(test_pat_with_space) + pat = self.prs.convert_line(test_pat_with_space)[0] m = re.search(pat, test_line) self.assertIsNotNone(m) self.assertEqual( @@ -193,7 +198,7 @@ def test_single_num_capture(self): test_pat = test_pat_template.format(s.value, n.value) with self.subTest(self.make_testname(v, n, s)): - npat = self.prs.convert_line(test_pat) + npat = self.prs.convert_line(test_pat)[0] m = re.search(npat, test_str) @@ -218,7 +223,7 @@ def test_single_nums_no_space(self): test_str = "This is a string with 123-456 in it." 
test_pat = "~ #x!.+i #!.-i ~" - npat = self.prs.convert_line(test_pat) + npat = self.prs.convert_line(test_pat)[0] m = re.search(npat, test_str) @@ -242,7 +247,7 @@ def test_single_num_preceding_colon_capture(self): test_pat = test_pat_template.format(s.value, n.value) with self.subTest(self.make_testname(v, n, s)): - npat = self.prs.convert_line(test_pat) + npat = self.prs.convert_line(test_pat)[0] m = re.search(npat, test_str) @@ -271,7 +276,7 @@ def test_string_and_single_num_capture(self): test_pat = test_pat_template.format(s.value, n.value) with self.subTest(self.make_testname(v, n, s)): - npat = self.prs.convert_line(test_pat) + npat = self.prs.convert_line(test_pat)[0] m = re.search(npat, test_str) @@ -303,7 +308,7 @@ def test_number_ending_sentence(self): pent.Quantity.Single, ) with self.subTest(token): - pat = self.prs.convert_line(test_pat.format(token)) + pat = self.prs.convert_line(test_pat.format(token))[0] m = re.search(pat, test_line.format(n)) self.assertIsNotNone(m, msg=test_line.format(n) + token) @@ -316,14 +321,14 @@ def test_match_entire_line(self): test_line = "This is a line with whatever weird (*#$(*&23646{}}{#$" with self.subTest("capture"): - pat = self.prs.convert_line("~!") + pat = self.prs.convert_line("~!")[0] self.assertTrue(self.does_parse_match(pat, test_line)) m = re.search(pat, test_line) self.assertEqual(test_line, m.group(pent.Token.group_prefix + "0")) with self.subTest("no_capture"): - pat = self.prs.convert_line("~") + pat = self.prs.convert_line("~")[0] self.assertTrue(self.does_parse_match(pat, test_line)) m = re.search(pat, test_line) @@ -340,7 +345,7 @@ def test_any_token_capture_ranges(self): test_num = "2e-4" test_line = test_line_start + "[" + test_num + "]" + test_line_end - pat = pent.Parser().convert_line("~! @x.[ #x!..g @x.] ~!") + pat = pent.Parser().convert_line("~! @x.[ #x!..g @x.] 
~!")[0] m = re.search(pat, test_line) self.assertEqual( @@ -360,7 +365,7 @@ def test_optional_str(self): for there, cap in itt.product(*itt.repeat((True, False), 2)): with self.subTest("There: {0}, Cap: {1}".format(there, cap)): pat = test_pat.format(pent.Token._s_capture if cap else "") - prs_pat = pent.Parser().convert_line(pat) + prs_pat = pent.Parser().convert_line(pat)[0] work_str = test_string.format("foo" if there else "") @@ -393,7 +398,7 @@ def test_one_or_more_str_nospace(self): for qty, cap in itt.product((1, 2, 3), (True, False)): with self.subTest("Qty: {0}, Cap: {1}".format(qty, cap)): pat = test_pat.format(pent.Token._s_capture if cap else "") - pat = pent.Parser().convert_line(pat) + pat = pent.Parser().convert_line(pat)[0] work_str = test_string.format("foo" * qty) @@ -419,7 +424,7 @@ def test_one_or_more_str_with_space(self): for qty, cap in itt.product((1, 2, 3), (True, False)): with self.subTest("Qty: {0}, Cap: {1}".format(qty, cap)): pat = test_pat.format(pent.Token._s_capture if cap else "") - pat = pent.Parser().convert_line(pat) + pat = pent.Parser().convert_line(pat)[0] work_str = test_string.format("foo " * qty) @@ -446,7 +451,7 @@ def test_zero_or_more_str(self): for qty, cap in itt.product((0, 1, 2, 3), (True, False)): with self.subTest("Qty: {0}, Cap: {1}".format(qty, cap)): pat = test_pat.format(pent.Token._s_capture if cap else "") - pat = pent.Parser().convert_line(pat) + pat = pent.Parser().convert_line(pat)[0] work_str = test_string.format("foo " * qty) @@ -470,7 +475,7 @@ def test_one_or_more_doesnt_match_zero_reps(self): test_string = "This is a test string." 
test_pat = "~ @.is @!?absolutely @.a ~" - m = re.search(self.prs.convert_line(test_pat), test_string) + m = re.search(self.prs.convert_line(test_pat)[0], test_string) self.assertEqual("", m.group(pent.Token.group_prefix + "0")) @@ -483,8 +488,8 @@ def test_manual_two_lines(self): test_pat_1 = "~ @!.one: #!.+i" test_pat_2 = "~ @!.two: #!.-s" - cp_1 = self.prs.convert_line(test_pat_1) - cp_2 = self.prs.convert_line(test_pat_2, group_id=2) + cp_1 = self.prs.convert_line(test_pat_1)[0] + cp_2 = self.prs.convert_line(test_pat_2, group_id=2)[0] m = re.search(cp_1 + r"\n" + cp_2, test_str) @@ -506,8 +511,8 @@ def test_quick_one_or_more_number(self): test_pat = "~ #!+.g ~" test_pat_period = "~ #x!+.g @.." - re_pat = self.prs.convert_line(test_pat) - re_pat_period = self.prs.convert_line(test_pat_period) + re_pat = self.prs.convert_line(test_pat)[0] + re_pat_period = self.prs.convert_line(test_pat_period)[0] with self.subTest("end_space"): m_pat = re.search(re_pat, test_str) @@ -523,6 +528,34 @@ def test_quick_one_or_more_number(self): m_pat_period.group(pent.Token.group_prefix + "0"), numbers ) + def test_orca_hess_freq_parser(self): + """Confirm 1-D data parser for ORCA freqs works.""" + import pent + + # from .testdata import orca_hess_freqs + + head_pattern = ("@.$vibrational_frequencies", "#.+i") + body_pattern = "#.+i #!..f" + + # Trivial application of the tail, but serves to check that + # it works correctly. + tail_pattern = ("~", "@.$normal_modes", "#++i") + + file_path = str(testdir_path / "C2F4_01.hess") + + freq_parser = pent.Parser( + head=head_pattern, body=body_pattern, tail=tail_pattern + ) + + with open(file_path) as f: + data = f.read() + + m = re.search(freq_parser.pattern, data) + self.assertIsNotNone(m) + self.assertEqual(m.group(0).count("\n"), 22) + + # ... 
more stuff here + class TestPentTokens(ut.TestCase, SuperPent): """Direct tests on the Token class.""" @@ -652,7 +685,7 @@ def test_three_token_sequence(self): with self.subTest( testname_template.format(v1, s1, v2, s2, v3) ): - npat = self.prs.convert_line(test_pat) + npat = self.prs.convert_line(test_pat)[0] m = re.search(npat, test_str) diff --git a/pent/test/testdata.py b/pent/test/testdata.py index a3fa8d8..d18da38 100644 --- a/pent/test/testdata.py +++ b/pent/test/testdata.py @@ -29,6 +29,33 @@ from pent import Number, Sign + +# ## RESULTS FROM THE ORCA HESS FILE ## + +orca_hess_freqs = [ + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 0.000000, + 194.490162, + 198.587114, + 389.931897, + 402.713910, + 538.244274, + 542.017838, + 548.246738, + 800.613516, + 1203.096114, + 1342.200360, + 1349.543713, + 1885.157022, +] + + +# ## SAMPLE NUMBERS AND MATCHING TOKENS ## + number_token_template = "#{{0}}{{1}}{{2}}{0}" number_patterns = { @@ -51,6 +78,9 @@ assert len(number_patterns) == 15 + +# ## EXHAUSTIVE MATCHES OF NUMBERS TO NUMBER/SIGN TYPES ## + number_sign_vals = { "0": { (Number.Integer, Sign.Positive): True, From 00a030c53ffe19578aa81cc6f5e4beb330223edb Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Wed, 19 Sep 2018 12:23:59 -0400 Subject: [PATCH 36/44] REFAC: Remove leading 'Bad' from custom errors Wouldn't be raising an exception if something bad hadn't happened. 
--- pent/__init__.py | 5 +++-- pent/errors.py | 8 ++++---- pent/parser.py | 10 ++++------ pent/test/pent_base.py | 2 +- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pent/__init__.py b/pent/__init__.py index af079c4..632ec33 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -39,12 +39,13 @@ "wordify_pattern", "std_wordify", "PentError", - "BadTokenError", + "TokenError", + "SectionError", ] from .enums import Number, Sign, TokenField from .enums import Content, Quantity -from .errors import PentError, BadTokenError +from .errors import PentError, TokenError, SectionError from .parser import Parser, Token from .patterns import number_patterns, wordify_pattern, std_wordify diff --git a/pent/errors.py b/pent/errors.py index 6421fc8..6db135a 100644 --- a/pent/errors.py +++ b/pent/errors.py @@ -31,11 +31,11 @@ class PentError(Exception): # pragma: no cover pass -class BadTokenError(PentError): # pragma: no cover +class TokenError(PentError): # pragma: no cover """Raised during attempts to parse an invalid token.""" def __init__(self, token): - """Instantiate a ``BadTokenError``.""" + """Instantiate a ``TokenError``.""" self.token = token def __str__(self): @@ -43,11 +43,11 @@ def __str__(self): return "'{}' is an invalid pent token".format(self.token) -class BadSectionError(PentError): # pragma: no cover +class SectionError(PentError): # pragma: no cover """Raised from failed attempts to parse a Parser section.""" def __init__(self, msg=""): - """Instantiate a ``BadSectionError``.""" + """Instantiate a ``SectionError``.""" self.msg = msg def __str__(self): diff --git a/pent/parser.py b/pent/parser.py index a49e5d4..43cadae 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -29,7 +29,7 @@ from .enums import Number, Sign, TokenField from .enums import Content, Quantity -from .errors import BadTokenError, BadSectionError +from .errors import TokenError, SectionError from .patterns import std_wordify_open, std_wordify_close @@ -64,9 +64,7 @@ def 
pattern(self): # At least one line of the body, followed by however many more rx += rx_body + "(\n" + rx_body + ")*" except TypeError as e: - raise BadSectionError( - "'body' required to generate 'pattern'" - ) from e + raise SectionError("'body' required to generate 'pattern'") from e if rx_tail: rx += "\n" + rx_tail @@ -100,7 +98,7 @@ def convert_section(cls, sec, capture_groups=False): except AttributeError: # Most likely is that the iterable members don't have # the .pattern attribute - raise BadSectionError("Unrecognized format") + raise SectionError("Unrecognized format") @classmethod def convert_line(cls, line, *, capture_groups=True, group_id=0): @@ -322,7 +320,7 @@ def __attrs_post_init__(self): try: self._pr = self._pp_token.parseString(self.token) except pp.ParseException as e: - raise BadTokenError(self.token) from e + raise TokenError(self.token) from e if self.is_any: self._pattern, self.needs_group_id = self._selective_group_enclose( diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index ab79b97..3226a9b 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -564,7 +564,7 @@ def test_arbitrary_bad_token(self): """Confirm bad tokens raise errors.""" import pent - self.assertRaises(pent.BadTokenError, pent.Token, "abcd") + self.assertRaises(pent.TokenError, pent.Token, "abcd") def test_group_enclosures(self): """Ensure 'ignore' flag is properly set.""" From 3c88b6370ada72700a6050d7c43b7a33c8da3eb7 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Wed, 19 Sep 2018 14:16:44 -0400 Subject: [PATCH 37/44] TDD: Add tests for head/tail captures; implement Head and tail captures are working satisfactorily for the ORCA .hess frequencies case. Will need to expand to more complex structures. Next step is implementing the body capture. 
--- pent/__init__.py | 1 + pent/enums.py | 13 +++++++++ pent/parser.py | 65 ++++++++++++++++++++++++++++++++++++------ pent/test/pent_base.py | 7 +++-- pent/test/testdata.py | 36 +++++++++++------------ 5 files changed, 93 insertions(+), 29 deletions(-) diff --git a/pent/__init__.py b/pent/__init__.py index 632ec33..3405493 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -33,6 +33,7 @@ "Number", "Sign", "TokenField", + "ParserField", "Content", "Quantity", "number_patterns", diff --git a/pent/enums.py b/pent/enums.py index d2569e2..49ea156 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -117,5 +117,18 @@ class TokenField(str, Enum): Sign = "sign" +class ParserField(str, Enum): + """Enumeration for the fields/subsections of a Parser pattern.""" + + #: Header + Head = "head" + + #: Body + Body = "body" + + #: Tail/footer + Tail = "tail" + + if __name__ == "__main__": # pragma: no cover print("Module not executable.") diff --git a/pent/parser.py b/pent/parser.py index 43cadae..50f4d58 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -24,10 +24,13 @@ """ +import itertools as itt +import re + import attr import pyparsing as pp -from .enums import Number, Sign, TokenField +from .enums import Number, Sign, TokenField, ParserField from .enums import Content, Quantity from .errors import TokenError, SectionError from .patterns import std_wordify_open, std_wordify_close @@ -45,8 +48,11 @@ class Parser: def pattern(self): """Return the regex pattern for the entire parser. - The capture groups are NEVER inserted when regex is - generated this way. + The individual capture groups are NEVER inserted when + regex is generated this way. + + Instead, head/body/tail capture groups are inserted, + in order to subdivide matched text by these subsets. 
""" # Relies on the convert_section default for 'capture_groups' @@ -58,19 +64,45 @@ def pattern(self): rx = "" if rx_head: - rx += rx_head + "\n" + rx += "(?P<{}>".format(ParserField.Head) + rx_head + ")\n" try: # At least one line of the body, followed by however many more - rx += rx_body + "(\n" + rx_body + ")*" + rx += ( + "(?P<{}>".format(ParserField.Body) + + rx_body + + "(\n" + + rx_body + + ")*)" + ) except TypeError as e: raise SectionError("'body' required to generate 'pattern'") from e if rx_tail: - rx += "\n" + rx_tail + rx += "\n(?P<{}>".format(ParserField.Tail) + rx_tail + ")" return rx + def capture_head(self, text): + """Capture all marked values from the pattern head.""" + m_entire = re.search(self.pattern, text) + head = m_entire.group(ParserField.Head) + + pat_capture = self.convert_section(self.head, capture_groups=True) + m_head = re.search(pat_capture, head) + + return list(*map(str.split, self.generate_captures(m_head))) + + def capture_tail(self, text): + """Capture all marked values from the pattern tail.""" + m_entire = re.search(self.pattern, text) + tail = m_entire.group(ParserField.Tail) + + pat_capture = self.convert_section(self.tail, capture_groups=True) + m_tail = re.search(pat_capture, tail) + + return list(*map(str.split, self.generate_captures(m_tail))) + @classmethod def convert_section(cls, sec, capture_groups=False): """Convert the head, body or tail to regex.""" @@ -91,10 +123,16 @@ def convert_section(cls, sec, capture_groups=False): pass # If it's an iterable of lines + def gen_converted_lines(): + id = 0 + for line in sec: + pat, id = cls.convert_line( + line, capture_groups=capture_groups, group_id=id + ) + yield pat + try: - return "\n".join( - cls.convert_line(_, capture_groups=False)[0] for _ in sec - ) + return "\n".join(gen_converted_lines()) except AttributeError: # Most likely is that the iterable members don't have # the .pattern attribute @@ -161,6 +199,15 @@ def convert_line(cls, line, *, capture_groups=True, 
group_id=0): return pattern, group_id + @staticmethod + def generate_captures(m): + """Generate captures from a regex match.""" + for i in itt.count(0): + try: + yield m.group(Token.group_prefix + str(i)) + except IndexError: + raise StopIteration + @attr.s(slots=True) class Token: diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 3226a9b..6f430a2 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -534,12 +534,12 @@ def test_orca_hess_freq_parser(self): # from .testdata import orca_hess_freqs - head_pattern = ("@.$vibrational_frequencies", "#.+i") + head_pattern = ("@.$vibrational_frequencies", "#!.+i") body_pattern = "#.+i #!..f" # Trivial application of the tail, but serves to check that # it works correctly. - tail_pattern = ("~", "@.$normal_modes", "#++i") + tail_pattern = ("~", "@.$normal_modes", "#!++i") file_path = str(testdir_path / "C2F4_01.hess") @@ -554,6 +554,9 @@ def test_orca_hess_freq_parser(self): self.assertIsNotNone(m) self.assertEqual(m.group(0).count("\n"), 22) + self.assertEqual(freq_parser.capture_head(data), ["18"]) + self.assertEqual(freq_parser.capture_tail(data), ["18", "18"]) + # ... 
more stuff here diff --git a/pent/test/testdata.py b/pent/test/testdata.py index d18da38..5722945 100644 --- a/pent/test/testdata.py +++ b/pent/test/testdata.py @@ -33,24 +33,24 @@ # ## RESULTS FROM THE ORCA HESS FILE ## orca_hess_freqs = [ - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 0.000000, - 194.490162, - 198.587114, - 389.931897, - 402.713910, - 538.244274, - 542.017838, - 548.246738, - 800.613516, - 1203.096114, - 1342.200360, - 1349.543713, - 1885.157022, + [0.000000], + [0.000000], + [0.000000], + [0.000000], + [0.000000], + [0.000000], + [194.490162], + [198.587114], + [389.931897], + [402.713910], + [538.244274], + [542.017838], + [548.246738], + [800.613516], + [1203.096114], + [1342.200360], + [1349.543713], + [1885.157022], ] From 4fbbfdcde4cb2e705234a21df1790e8dfa3d914c Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Thu, 20 Sep 2018 15:00:04 -0400 Subject: [PATCH 38/44] DEV/REFAC: Token to new .py Also switch _get_number_pattern and _get_string_pattern to regular instance methods. Likelihood of external independent use is low, and Token instantiation should be cheap. Closes #32. If general use of the sub-in-escape-sequences-for-regex-special- characters function is needed, then refactor into a utils module. 
--- pent/__init__.py | 3 +- pent/parser.py | 247 +----------------------------------------- pent/token.py | 274 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 279 insertions(+), 245 deletions(-) create mode 100644 pent/token.py diff --git a/pent/__init__.py b/pent/__init__.py index 3405493..7678e04 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -47,8 +47,9 @@ from .enums import Number, Sign, TokenField from .enums import Content, Quantity from .errors import PentError, TokenError, SectionError -from .parser import Parser, Token +from .parser import Parser from .patterns import number_patterns, wordify_pattern, std_wordify +from .token import Token __version__ = "0.1.dev2" diff --git a/pent/parser.py b/pent/parser.py index 50f4d58..61fa9cd 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -28,12 +28,11 @@ import re import attr -import pyparsing as pp -from .enums import Number, Sign, TokenField, ParserField -from .enums import Content, Quantity -from .errors import TokenError, SectionError +from .enums import ParserField +from .errors import SectionError from .patterns import std_wordify_open, std_wordify_close +from .token import Token @attr.s(slots=True) @@ -209,245 +208,5 @@ def generate_captures(m): raise StopIteration -@attr.s(slots=True) -class Token: - """Encapsulates transforming mini-language patterns tokens into regex.""" - - from .patterns import number_patterns as _numpats - - #: Mini-language token string to be parsed - token = attr.ib() - - #: Whether group capture should be added or not - do_capture = attr.ib(default=True) - - #: Flag for whether group ID substitution needs to be done - needs_group_id = attr.ib(default=False, init=False, repr=False) - - # Internal pyparsing parse result and generated regex pattern - _pr = attr.ib(default=None, init=False, repr=False) - _pattern = attr.ib(default=None, init=False, repr=False) - - # ##### pyparsing pattern internals ##### - - # ## MINOR PATTERN COMPONENTS ## - group_prefix = 
"g" - _s_any_flag = "~" - _s_capture = "!" - _s_no_space = "x" - - _pp_no_space = pp.Optional(pp.Literal(_s_no_space)).setResultsName( - TokenField.NoSpace - ) - _pp_capture = pp.Optional(pp.Literal(_s_capture)).setResultsName( - TokenField.Capture - ) - _pp_quantity = pp.Word("".join(Quantity), exact=1).setResultsName( - TokenField.Quantity - ) - - # ## ARBITRARY CONTENT TOKEN ## - # Anything may be matched here, including multiple words. - _pp_any_flag = ( - pp.Literal(_s_any_flag).setResultsName(TokenField.Type) + _pp_capture - ) - - # ## LITERAL STRING TOKEN ## - # Marker for the rest of the token to be a literal string - _pp_str_flag = pp.Literal(Content.String.value).setResultsName( - TokenField.Type - ) - - # Remainder of the content after the marker, spaces included - _pp_str_value = pp.Word(pp.printables + " ").setResultsName(TokenField.Str) - - # Composite pattern for a literal string - _pp_string = ( - _pp_str_flag - + _pp_no_space - + _pp_capture - + _pp_quantity - + _pp_str_value - ) - - # ## NUMERICAL VALUE TOKEN ## - # Initial marker for a numerical value - _pp_num_flag = pp.Literal(Content.Number.value).setResultsName( - TokenField.Type - ) - - # Marker for the sign of the value; period indicates either sign - _pp_num_sign = pp.Word("".join(Sign), exact=1).setResultsName( - TokenField.Sign - ) - - # Marker for the number type to look for - _pp_num_type = pp.Word("".join(Number), exact=1).setResultsName( - TokenField.Number - ) - - # Composite pattern for a number - _pp_number = ( - _pp_num_flag - + _pp_no_space - + _pp_capture - + _pp_quantity - + pp.Group(_pp_num_sign + _pp_num_type).setResultsName( - TokenField.SignNumber - ) - ) - - # ## COMBINED TOKEN PARSER ## - _pp_token = ( - pp.StringStart() - + (_pp_any_flag ^ _pp_string ^ _pp_number) - + pp.StringEnd() - ) - - # Informational properties - @property - def pattern(self): - """Return assembled regex pattern from the token, as |str|.""" - return self._pattern - - @property - def is_any(self): 
- """Return flag for whether the token is an "any content" token.""" - return self._pr[TokenField.Type] == Content.Any - - @property - def is_str(self): - """Return flag for whether the token matches a literal string.""" - return self._pr[TokenField.Type] == Content.String - - @property - def is_num(self): - """Return flag for whether the token matches a number.""" - return self._pr[TokenField.Type] == Content.Number - - @property - def match_quantity(self): - """Return match quantity; |None| for :attr:`pent.enums.Content.Any`.""" - if self.is_any: - return None - else: - return Quantity(self._pr[TokenField.Quantity]) - - @property - def number(self): - """#: Return number format; |None| if token doesn't match a number.""" - if self.is_num: - return Number(self._pr[TokenField.SignNumber][TokenField.Number]) - else: - return None - - @property - def sign(self): - """#: Return number sign; |None| if token doesn't match a number.""" - if self.is_num: - return Sign(self._pr[TokenField.SignNumber][TokenField.Sign]) - else: - return None - - @property - def space_after(self): - """Return flag for whether post-match space should be provided for.""" - if self.is_any: - return False - else: - return TokenField.NoSpace not in self._pr - - @property - def capture(self): - """Return flag for whether a regex capture group should be created.""" - return TokenField.Capture in self._pr - - def __attrs_post_init__(self): - """Handle automatic creation stuff.""" - try: - self._pr = self._pp_token.parseString(self.token) - except pp.ParseException as e: - raise TokenError(self.token) from e - - if self.is_any: - self._pattern, self.needs_group_id = self._selective_group_enclose( - ".*?" 
- ) - return - - # Only single, non-optional captures implemented for now, regardless of - # the Quantity flag in the token - if self.is_str: - # Always store the string pattern - self._pattern = self._string_pattern(self._pr[TokenField.Str]) - - # Modify, depending on the Quantity - if self.match_quantity is Quantity.OneOrMore: - self._pattern = "(" + self._pattern + ")+" - - elif self.is_num: - self._pattern = self._get_number_pattern(self._pr) - - if self.match_quantity is Quantity.OneOrMore: - self._pattern += r"([ \t]+{})*".format(self._pattern) - - else: # pragma: no cover - raise NotImplementedError( - "Unknown content type somehow specified!" - ) - - self._pattern, self.needs_group_id = self._selective_group_enclose( - self._pattern - ) - - @staticmethod - def _string_pattern(s): - """Create a literal string pattern from `s`.""" - pattern = "" - - for c in s: - if c in "[\^$.|?*+(){}": - # Must escape regex special characters - pattern += "\\" + c - else: - pattern += c - - return pattern - - @classmethod - def _get_number_pattern(cls, parse_result): - """Return the correct number pattern given the parse result.""" - num = Number(parse_result[TokenField.SignNumber][TokenField.Number]) - sign = Sign(parse_result[TokenField.SignNumber][TokenField.Sign]) - - return cls._numpats[num, sign] - - @classmethod - def _group_open(cls): - """Create the opening pattern for a named group. - - This leaves a formatting placeholder for the invoking Parser - to inject the appropriate group ID. - - """ - return r"(?P<{0}{{0}}>".format(cls.group_prefix) - - @staticmethod - def _group_close(): - """Create the closing pattern for a named group.""" - return ")" - - def _selective_group_enclose(self, pat): - """Return token pattern enclosed in group IF it should be grouped. - - FIX THIS DOCSTRING, IT'S OUT OF DATE!!! 
- - """ - if self.do_capture and self.capture: - return (self._group_open() + pat + self._group_close(), True) - else: - return pat, False - - if __name__ == "__main__": # pragma: no cover print("Module not executable.") diff --git a/pent/token.py b/pent/token.py new file mode 100644 index 0000000..e61abb4 --- /dev/null +++ b/pent/token.py @@ -0,0 +1,274 @@ +r"""*Token handling for mini-language parser for* ``pent``. + +``pent`` Extracts Numerical Text. + +**Author** + Brian Skinn (bskinn@alum.mit.edu) + +**File Created** + 20 Sep 2018 + +**Copyright** + \(c) Brian Skinn 2018 + +**Source Repository** + http://www.github.com/bskinn/pent + +**Documentation** + http://pent.readthedocs.io + +**License** + The MIT License; see |license_txt|_ for full license terms + +**Members** + +""" + +import attr +import pyparsing as pp + +from .enums import Number, Sign, TokenField +from .enums import Content, Quantity +from .errors import TokenError + + +@attr.s(slots=True) +class Token: + """Encapsulates transforming mini-language patterns tokens into regex.""" + + from .patterns import number_patterns as _numpats + + #: Mini-language token string to be parsed + token = attr.ib() + + #: Whether group capture should be added or not + do_capture = attr.ib(default=True) + + #: Flag for whether group ID substitution needs to be done + needs_group_id = attr.ib(default=False, init=False, repr=False) + + # Internal pyparsing parse result and generated regex pattern + _pr = attr.ib(default=None, init=False, repr=False) + _pattern = attr.ib(default=None, init=False, repr=False) + + # ##### pyparsing pattern internals ##### + + # ## MINOR PATTERN COMPONENTS ## + group_prefix = "g" + _s_any_flag = "~" + _s_capture = "!" 
+ _s_no_space = "x" + + _pp_no_space = pp.Optional(pp.Literal(_s_no_space)).setResultsName( + TokenField.NoSpace + ) + _pp_capture = pp.Optional(pp.Literal(_s_capture)).setResultsName( + TokenField.Capture + ) + _pp_quantity = pp.Word("".join(Quantity), exact=1).setResultsName( + TokenField.Quantity + ) + + # ## ARBITRARY CONTENT TOKEN ## + # Anything may be matched here, including multiple words. + _pp_any_flag = ( + pp.Literal(_s_any_flag).setResultsName(TokenField.Type) + _pp_capture + ) + + # ## LITERAL STRING TOKEN ## + # Marker for the rest of the token to be a literal string + _pp_str_flag = pp.Literal(Content.String.value).setResultsName( + TokenField.Type + ) + + # Remainder of the content after the marker, spaces included + _pp_str_value = pp.Word(pp.printables + " ").setResultsName(TokenField.Str) + + # Composite pattern for a literal string + _pp_string = ( + _pp_str_flag + + _pp_no_space + + _pp_capture + + _pp_quantity + + _pp_str_value + ) + + # ## NUMERICAL VALUE TOKEN ## + # Initial marker for a numerical value + _pp_num_flag = pp.Literal(Content.Number.value).setResultsName( + TokenField.Type + ) + + # Marker for the sign of the value; period indicates either sign + _pp_num_sign = pp.Word("".join(Sign), exact=1).setResultsName( + TokenField.Sign + ) + + # Marker for the number type to look for + _pp_num_type = pp.Word("".join(Number), exact=1).setResultsName( + TokenField.Number + ) + + # Composite pattern for a number + _pp_number = ( + _pp_num_flag + + _pp_no_space + + _pp_capture + + _pp_quantity + + pp.Group(_pp_num_sign + _pp_num_type).setResultsName( + TokenField.SignNumber + ) + ) + + # ## COMBINED TOKEN PARSER ## + _pp_token = ( + pp.StringStart() + + (_pp_any_flag ^ _pp_string ^ _pp_number) + + pp.StringEnd() + ) + + # Informational properties + @property + def pattern(self): + """Return assembled regex pattern from the token, as |str|.""" + return self._pattern + + @property + def is_any(self): + """Return flag for whether the token is 
an "any content" token.""" + return self._pr[TokenField.Type] == Content.Any + + @property + def is_str(self): + """Return flag for whether the token matches a literal string.""" + return self._pr[TokenField.Type] == Content.String + + @property + def is_num(self): + """Return flag for whether the token matches a number.""" + return self._pr[TokenField.Type] == Content.Number + + @property + def match_quantity(self): + """Return match quantity; |None| for :attr:`pent.enums.Content.Any`.""" + if self.is_any: + return None + else: + return Quantity(self._pr[TokenField.Quantity]) + + @property + def number(self): + """#: Return number format; |None| if token doesn't match a number.""" + if self.is_num: + return Number(self._pr[TokenField.SignNumber][TokenField.Number]) + else: + return None + + @property + def sign(self): + """#: Return number sign; |None| if token doesn't match a number.""" + if self.is_num: + return Sign(self._pr[TokenField.SignNumber][TokenField.Sign]) + else: + return None + + @property + def space_after(self): + """Return flag for whether post-match space should be provided for.""" + if self.is_any: + return False + else: + return TokenField.NoSpace not in self._pr + + @property + def capture(self): + """Return flag for whether a regex capture group should be created.""" + return TokenField.Capture in self._pr + + def __attrs_post_init__(self): + """Handle automatic creation stuff.""" + try: + self._pr = self._pp_token.parseString(self.token) + except pp.ParseException as e: + raise TokenError(self.token) from e + + if self.is_any: + self._pattern, self.needs_group_id = self._selective_group_enclose( + ".*?" 
+ ) + return + + # Only single, non-optional captures implemented for now, regardless of + # the Quantity flag in the token + if self.is_str: + # Always store the string pattern + self._pattern = self._string_pattern() + + # Modify, depending on the Quantity + if self.match_quantity is Quantity.OneOrMore: + self._pattern = "(" + self._pattern + ")+" + + elif self.is_num: + self._pattern = self._get_number_pattern() + + if self.match_quantity is Quantity.OneOrMore: + self._pattern += r"([ \t]+{})*".format(self._pattern) + + else: # pragma: no cover + raise NotImplementedError( + "Unknown content type somehow specified!" + ) + + self._pattern, self.needs_group_id = self._selective_group_enclose( + self._pattern + ) + + def _string_pattern(self): + """Create a literal string pattern from the parse result.""" + pattern = "" + + for c in self._pr[TokenField.Str]: + if c in "[\^$.|?*+(){}": + # Must escape regex special characters + pattern += "\\" + c + else: + pattern += c + + return pattern + + def _get_number_pattern(self): + """Return the correct number pattern given the parse result.""" + num = Number(self._pr[TokenField.SignNumber][TokenField.Number]) + sign = Sign(self._pr[TokenField.SignNumber][TokenField.Sign]) + + return self._numpats[num, sign] + + @classmethod + def _group_open(cls): + """Create the opening pattern for a named group. + + This leaves a formatting placeholder for the invoking Parser + to inject the appropriate group ID. + + """ + return r"(?P<{0}{{0}}>".format(cls.group_prefix) + + @staticmethod + def _group_close(): + """Create the closing pattern for a named group.""" + return ")" + + def _selective_group_enclose(self, pat): + """Return token pattern enclosed in group IF it should be grouped. + + FIX THIS DOCSTRING, IT'S OUT OF DATE!!! 
+ + """ + if self.do_capture and self.capture: + return (self._group_open() + pat + self._group_close(), True) + else: + return pat, False + + +if __name__ == "__main__": # pragma: no cover + print("Module not executable.") From 7b6e3be8125ae9fc5acf10bf3fd088f56d658dc5 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Fri, 21 Sep 2018 12:46:10 -0400 Subject: [PATCH 39/44] TDD: Add test for freqs, implement Worked first-try for a single-block result! Expected to fail for a multi-block result. --- pent/parser.py | 25 +++++++++++++++++++++++++ pent/test/pent_base.py | 6 ++++-- pent/test/testdata.py | 36 ++++++++++++++++++------------------ 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/pent/parser.py b/pent/parser.py index 61fa9cd..4c37cd9 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -102,6 +102,31 @@ def capture_tail(self, text): return list(*map(str.split, self.generate_captures(m_tail))) + def capture_body(self, text): + """Capture all values from the pattern body, recursing if needed.""" + m_entire = re.search(self.pattern, text) + body = m_entire.group(ParserField.Body) + + # If the 'body' pattern is a Parser + try: + return self.body.capture_body(body) + except AttributeError: + pass + + # If the 'body' pattern is a string + try: + pat = self.convert_line(self.body, capture_groups=True)[0] + except AttributeError: + raise SectionError("Invalid 'body' pattern for capture") + else: + caps = [] + for m in re.finditer(pat, text): + caps.append(list(*map(str.split, self.generate_captures(m)))) + + return caps + + # Iterable of lines? 
+ @classmethod def convert_section(cls, sec, capture_groups=False): """Convert the head, body or tail to regex.""" diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 6f430a2..3d89de2 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -532,7 +532,7 @@ def test_orca_hess_freq_parser(self): """Confirm 1-D data parser for ORCA freqs works.""" import pent - # from .testdata import orca_hess_freqs + from .testdata import orca_hess_freqs head_pattern = ("@.$vibrational_frequencies", "#!.+i") body_pattern = "#.+i #!..f" @@ -557,7 +557,9 @@ def test_orca_hess_freq_parser(self): self.assertEqual(freq_parser.capture_head(data), ["18"]) self.assertEqual(freq_parser.capture_tail(data), ["18", "18"]) - # ... more stuff here + self.assertEqual(freq_parser.capture_body(data), orca_hess_freqs) + + # ... more stuff here? class TestPentTokens(ut.TestCase, SuperPent): diff --git a/pent/test/testdata.py b/pent/test/testdata.py index 5722945..5103856 100644 --- a/pent/test/testdata.py +++ b/pent/test/testdata.py @@ -33,24 +33,24 @@ # ## RESULTS FROM THE ORCA HESS FILE ## orca_hess_freqs = [ - [0.000000], - [0.000000], - [0.000000], - [0.000000], - [0.000000], - [0.000000], - [194.490162], - [198.587114], - [389.931897], - [402.713910], - [538.244274], - [542.017838], - [548.246738], - [800.613516], - [1203.096114], - [1342.200360], - [1349.543713], - [1885.157022], + ["0.000000"], + ["0.000000"], + ["0.000000"], + ["0.000000"], + ["0.000000"], + ["0.000000"], + ["194.490162"], + ["198.587114"], + ["389.931897"], + ["402.713910"], + ["538.244274"], + ["542.017838"], + ["548.246738"], + ["800.613516"], + ["1203.096114"], + ["1342.200360"], + ["1349.543713"], + ["1885.157022"], ] From ef0b8c6528ff9a0c046e67c46ae2afb08110b552 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Fri, 21 Sep 2018 13:30:17 -0400 Subject: [PATCH 40/44] DEV: Body capture working for single blocks Much more complex logic is needed for assembly of a multi-block field. 
--- pent/parser.py | 6 +++--- pent/test/pent_base.py | 27 ++++++++++++++++++++++++++- pent/test/testdata.py | 24 +++++++++++++++++++++++- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/pent/parser.py b/pent/parser.py index 4c37cd9..a54ca37 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -105,11 +105,11 @@ def capture_tail(self, text): def capture_body(self, text): """Capture all values from the pattern body, recursing if needed.""" m_entire = re.search(self.pattern, text) - body = m_entire.group(ParserField.Body) + body_text = m_entire.group(ParserField.Body) # If the 'body' pattern is a Parser try: - return self.body.capture_body(body) + return self.body.capture_body(body_text) except AttributeError: pass @@ -120,7 +120,7 @@ def capture_body(self, text): raise SectionError("Invalid 'body' pattern for capture") else: caps = [] - for m in re.finditer(pat, text): + for m in re.finditer(pat, body_text): caps.append(list(*map(str.split, self.generate_captures(m)))) return caps diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 3d89de2..3b1c6db 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -99,6 +99,15 @@ class TestPentParserPatterns(ut.TestCase, SuperPent): prs = pent.Parser() + def test_empty_pattern_matches_blank_line(self): + """Confirm an empty pattern matches only a blank line.""" + import pent + + prs = pent.Parser(body="") + + self.assertIsNotNone(re.search(prs.pattern, "")) + self.assertIsNone(re.search(prs.pattern, "3")) + def test_group_tags_or_not(self): """Confirm group tags are added when needed; omitted when not.""" import pent @@ -559,7 +568,23 @@ def test_orca_hess_freq_parser(self): self.assertEqual(freq_parser.capture_body(data), orca_hess_freqs) - # ... more stuff here? 
+ def test_orca_hess_dipders_parser(self): + """Confirm 2-D single-block data parser for ORCA dipders works.""" + import pent + + from .testdata import orca_hess_dipders + + head_pattern = ("@.$dipole_derivatives", "#.+i") + body_pattern = "#!+.f" + + file_path = str(testdir_path / "C2F4_01.hess") + + freq_parser = pent.Parser(head=head_pattern, body=body_pattern) + + with open(file_path) as f: + data = f.read() + + self.assertEqual(freq_parser.capture_body(data), orca_hess_dipders) class TestPentTokens(ut.TestCase, SuperPent): diff --git a/pent/test/testdata.py b/pent/test/testdata.py index 5103856..8cce71b 100644 --- a/pent/test/testdata.py +++ b/pent/test/testdata.py @@ -30,7 +30,29 @@ from pent import Number, Sign -# ## RESULTS FROM THE ORCA HESS FILE ## +# ## RESULTS FROM THE C2F4_01.hess FILE ## + +orca_hess_dipders = [ + ["-1.041194", "0.077425", "-0.236446"], + ["0.021347", "-0.053665", "0.064080"], + ["-0.064917", "0.064030", "-0.228121"], + ["1.334075", "0.025554", "-0.077999"], + ["0.025455", "0.179824", "-0.349540"], + ["-0.078013", "-0.349384", "1.132034"], + ["-0.292885", "-0.102966", "0.314406"], + ["-0.046806", "-0.126137", "0.285463"], + ["0.142946", "0.285361", "-0.903922"], + ["1.334072", "0.025555", "-0.078001"], + ["0.025456", "0.179823", "-0.349540"], + ["-0.078015", "-0.349384", "1.132038"], + ["-1.041194", "0.077424", "-0.236443"], + ["0.021345", "-0.053666", "0.064079"], + ["-0.064912", "0.064030", "-0.228121"], + ["-0.292884", "-0.102965", "0.314403"], + ["-0.046806", "-0.126136", "0.285464"], + ["0.142945", "0.285362", "-0.903926"], +] + orca_hess_freqs = [ ["0.000000"], From 1b32834a1de35a977825ea8e4e3aba26e2cd0e8b Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Fri, 21 Sep 2018 14:00:59 -0400 Subject: [PATCH 41/44] TDD: Implement multiline body strings Iterable of strings for the body. Seems to work nicely! Now have to figure out implementation of the multi-block, with body=Parser(...). 
Rectangular 2D dataset with a single block works ok, satisfying part of #34. Still need to implement and test, again, a MULTI-block rectangular (and square) dataset. --- pent/parser.py | 13 +++++++------ pent/test/pent_base.py | 13 +++++++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pent/parser.py b/pent/parser.py index a54ca37..e5b9455 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -113,20 +113,21 @@ def capture_body(self, text): except AttributeError: pass - # If the 'body' pattern is a string + # If the 'body' pattern is a string or iterable of strings try: - pat = self.convert_line(self.body, capture_groups=True)[0] + pat = self.convert_section(self.body, capture_groups=True) except AttributeError: raise SectionError("Invalid 'body' pattern for capture") else: caps = [] for m in re.finditer(pat, body_text): - caps.append(list(*map(str.split, self.generate_captures(m)))) + line_caps = [] + for c in self.generate_captures(m): + line_caps.extend(str.split(c)) + caps.append(line_caps) return caps - # Iterable of lines? 
- @classmethod def convert_section(cls, sec, capture_groups=False): """Convert the head, body or tail to regex.""" @@ -230,7 +231,7 @@ def generate_captures(m): try: yield m.group(Token.group_prefix + str(i)) except IndexError: - raise StopIteration + break if __name__ == "__main__": # pragma: no cover diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 3b1c6db..d13b619 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -537,6 +537,19 @@ def test_quick_one_or_more_number(self): m_pat_period.group(pent.Token.group_prefix + "0"), numbers ) + def test_multiline_body_parser(self): + """Confirm parsing w/multi-line body works ok.""" + import pent + + result = [["1", "2", "4"]] + + text = "\n1\n\n2\n\n\n4" + + pat = ("", "#!.+i", "", "#!.+i", "", "", "#!.+i") + prs = pent.Parser(body=pat) + + self.assertEqual(prs.capture_body(text), result) + def test_orca_hess_freq_parser(self): """Confirm 1-D data parser for ORCA freqs works.""" import pent From 3bebd53cb80ad4fdd09829a0ed8e2c3b480c7331 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Fri, 21 Sep 2018 23:26:58 -0400 Subject: [PATCH 42/44] DEV: Implement multiblock capture Works to grab when only one instance is in the data; need to extend to capture ALL times the data occurs, e.g. for geometry optimization stats. 
[skip ci] --- pent/__init__.py | 2 +- pent/parser.py | 60 +++++++++++++++++++++++++++++------------- pent/test/pent_base.py | 37 +++++++++++++++++++++++--- pent/test/testdata.py | 5 ++++ 4 files changed, 81 insertions(+), 23 deletions(-) diff --git a/pent/__init__.py b/pent/__init__.py index 7678e04..ebbf69f 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -44,7 +44,7 @@ "SectionError", ] -from .enums import Number, Sign, TokenField +from .enums import Number, Sign, TokenField, ParserField from .enums import Content, Quantity from .errors import PentError, TokenError, SectionError from .parser import Parser diff --git a/pent/parser.py b/pent/parser.py index e5b9455..921765e 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -43,8 +43,7 @@ class Parser: body = attr.ib(default=None) tail = attr.ib(default=None) - @property - def pattern(self): + def pattern(self, capture_sections=True): """Return the regex pattern for the entire parser. The individual capture groups are NEVER inserted when @@ -52,39 +51,58 @@ def pattern(self): Instead, head/body/tail capture groups are inserted, in order to subdivide matched text by these subsets. + These 'section' capture groups are ONLY inserted for the + top-level Parser, though -- they are suppressed for inner + nested Parsers. """ # Relies on the convert_section default for 'capture_groups' # as False. 
- rx_head, rx_body, rx_tail = map( - self.convert_section, (self.head, self.body, self.tail) - ) + rx_head = self.convert_section(self.head, capture_sections=False) + rx_body = self.convert_section(self.body, capture_sections=False) + rx_tail = self.convert_section(self.tail, capture_sections=False) + # rx_head, rx_body, rx_tail = map( + # self.convert_section, (self.head, self.body, self.tail) + # ) rx = "" if rx_head: - rx += "(?P<{}>".format(ParserField.Head) + rx_head + ")\n" + rx += ( + "(?P<{}>".format(ParserField.Head) + rx_head + ")\n" + if capture_sections + else rx_head + "\n" + ) try: # At least one line of the body, followed by however many more rx += ( - "(?P<{}>".format(ParserField.Body) + ( + "(?P<{}>".format(ParserField.Body) + if capture_sections + else "" + ) + rx_body + "(\n" + rx_body - + ")*)" + + ")*" + + (")" if capture_sections else "") ) except TypeError as e: raise SectionError("'body' required to generate 'pattern'") from e if rx_tail: - rx += "\n(?P<{}>".format(ParserField.Tail) + rx_tail + ")" + rx += ( + "\n(?P<{}>".format(ParserField.Tail) + rx_tail + ")" + if capture_sections + else "\n" + rx_tail + ) return rx def capture_head(self, text): """Capture all marked values from the pattern head.""" - m_entire = re.search(self.pattern, text) + m_entire = re.search(self.pattern(), text) head = m_entire.group(ParserField.Head) pat_capture = self.convert_section(self.head, capture_groups=True) @@ -94,7 +112,7 @@ def capture_head(self, text): def capture_tail(self, text): """Capture all marked values from the pattern tail.""" - m_entire = re.search(self.pattern, text) + m_entire = re.search(self.pattern(), text) tail = m_entire.group(ParserField.Tail) pat_capture = self.convert_section(self.tail, capture_groups=True) @@ -104,14 +122,18 @@ def capture_tail(self, text): def capture_body(self, text): """Capture all values from the pattern body, recursing if needed.""" - m_entire = re.search(self.pattern, text) + m_entire = 
re.search(self.pattern(), text) body_text = m_entire.group(ParserField.Body) # If the 'body' pattern is a Parser - try: - return self.body.capture_body(body_text) - except AttributeError: - pass + if isinstance(self.body, self.__class__): + data = [] + body_subpat = self.body.pattern(capture_sections=True) + + for m in re.finditer(body_subpat, body_text): + data.append(self.body.capture_body(m.group(0))) + + return data # If the 'body' pattern is a string or iterable of strings try: @@ -123,13 +145,13 @@ def capture_body(self, text): for m in re.finditer(pat, body_text): line_caps = [] for c in self.generate_captures(m): - line_caps.extend(str.split(c)) + line_caps.extend(c.split()) caps.append(line_caps) return caps @classmethod - def convert_section(cls, sec, capture_groups=False): + def convert_section(cls, sec, capture_groups=False, capture_sections=True): """Convert the head, body or tail to regex.""" # Could be None if sec is None: @@ -137,7 +159,7 @@ def convert_section(cls, sec, capture_groups=False): # If it's a Parser try: - return sec.pattern + return sec.pattern(capture_sections=capture_sections) except AttributeError: pass diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index d13b619..f0046dd 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -105,8 +105,8 @@ def test_empty_pattern_matches_blank_line(self): prs = pent.Parser(body="") - self.assertIsNotNone(re.search(prs.pattern, "")) - self.assertIsNone(re.search(prs.pattern, "3")) + self.assertIsNotNone(re.search(prs.pattern(), "")) + self.assertIsNone(re.search(prs.pattern(), "3")) def test_group_tags_or_not(self): """Confirm group tags are added when needed; omitted when not.""" @@ -572,7 +572,7 @@ def test_orca_hess_freq_parser(self): with open(file_path) as f: data = f.read() - m = re.search(freq_parser.pattern, data) + m = re.search(freq_parser.pattern(), data) self.assertIsNotNone(m) self.assertEqual(m.group(0).count("\n"), 22) @@ -599,6 +599,37 @@ def 
test_orca_hess_dipders_parser(self): self.assertEqual(freq_parser.capture_body(data), orca_hess_dipders) + def test_simple_multiblock(self): + """Confirm simple multiblock parser works correctly.""" + from textwrap import dedent + + import pent + + from .testdata import mblock_result + + data = dedent( + """ + test + + more test + + $data + 1 2 3 + 1 2.5 -3.5 0.8 + 2 -1.2 8.1 -9.2 + + 4 5 6 + 1 -0.1 3.5 8.1 + 2 1.4 2.2 -4.7 + + $next_data""" + ) + + prs_inner = pent.Parser(head="#++i", body="#.+i #!+.f", tail="") + prs_outer = pent.Parser(head="@.$data", body=prs_inner) + + self.assertEqual(prs_outer.capture_body(data), mblock_result) + class TestPentTokens(ut.TestCase, SuperPent): """Direct tests on the Token class.""" diff --git a/pent/test/testdata.py b/pent/test/testdata.py index 8cce71b..f3ed75c 100644 --- a/pent/test/testdata.py +++ b/pent/test/testdata.py @@ -29,6 +29,11 @@ from pent import Number, Sign +mblock_result = [ + [["2.5", "-3.5", "0.8"], ["-1.2", "8.1", "-9.2"]], + [["-0.1", "3.5", "8.1"], ["1.4", "2.2", "-4.7"]], +] + # ## RESULTS FROM THE C2F4_01.hess FILE ## From 0801ba866760233abbd14f5d4efb8c230011f756 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sat, 22 Sep 2018 12:53:12 -0400 Subject: [PATCH 43/44] TDD: Implement repeated multiblock capture Toy tests appear to be working as expected. Real test will be on real data. Closes #34. Closes #36. Closes #40. 
--- pent/parser.py | 51 ++++++++++++----------- pent/test/pent_base.py | 35 +++++++++++++++- pent/test/testdata.py | 93 +++++++++++++++++++++++++----------------- 3 files changed, 117 insertions(+), 62 deletions(-) diff --git a/pent/parser.py b/pent/parser.py index 921765e..b52ce8a 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -122,33 +122,38 @@ def capture_tail(self, text): def capture_body(self, text): """Capture all values from the pattern body, recursing if needed.""" - m_entire = re.search(self.pattern(), text) - body_text = m_entire.group(ParserField.Body) + cap_blocks = [] + for m_entire in re.finditer(self.pattern(), text): + block_text = m_entire.group(ParserField.Body) - # If the 'body' pattern is a Parser - if isinstance(self.body, self.__class__): - data = [] - body_subpat = self.body.pattern(capture_sections=True) + # If the 'body' pattern is a Parser + if isinstance(self.body, self.__class__): + data = [] + body_subpat = self.body.pattern(capture_sections=True) - for m in re.finditer(body_subpat, body_text): - data.append(self.body.capture_body(m.group(0))) + for m in re.finditer(body_subpat, block_text): + data.extend(self.body.capture_body(m.group(0))) - return data + cap_blocks.append(data) + continue - # If the 'body' pattern is a string or iterable of strings - try: - pat = self.convert_section(self.body, capture_groups=True) - except AttributeError: - raise SectionError("Invalid 'body' pattern for capture") - else: - caps = [] - for m in re.finditer(pat, body_text): - line_caps = [] - for c in self.generate_captures(m): - line_caps.extend(c.split()) - caps.append(line_caps) - - return caps + # If the 'body' pattern is a string or iterable of strings + try: + pat = self.convert_section(self.body, capture_groups=True) + except AttributeError: + raise SectionError("Invalid 'body' pattern for capture") + else: + data = [] + for m in re.finditer(pat, block_text): + line_caps = [] + for c in self.generate_captures(m): + 
line_caps.extend(c.split()) + data.append(line_caps) + + cap_blocks.append(data) + continue + + return cap_blocks @classmethod def convert_section(cls, sec, capture_groups=False, capture_sections=True): diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index f0046dd..d351a75 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -541,7 +541,7 @@ def test_multiline_body_parser(self): """Confirm parsing w/multi-line body works ok.""" import pent - result = [["1", "2", "4"]] + result = [[["1", "2", "4"]]] text = "\n1\n\n2\n\n\n4" @@ -630,6 +630,39 @@ def test_simple_multiblock(self): self.assertEqual(prs_outer.capture_body(data), mblock_result) + def test_repeated_multiblock(self): + """Confirm repeated multiblock parser works correctly.""" + from textwrap import dedent + + import pent + + from .testdata import mblock_repeated_result + + data = dedent( + """ + $top + 1 2 3 + 0.2 0.3 0.4 + 0.3 0.4 0.6 + 4 5 6 + 0.1 0.1 0.1 + 0.5 0.5 0.5 + + $top + 7 8 9 + 0.2 0.2 0.2 + 0.6 0.6 0.6 + 1 2 3 + 0.4 0.4 0.4 + 0.8 0.8 0.8 + """ + ) + + prs_inner = pent.Parser(head="#++i", body="#!+.f") + prs_outer = pent.Parser(head="@.$top", body=prs_inner) + + self.assertEqual(prs_outer.capture_body(data), mblock_repeated_result) + class TestPentTokens(ut.TestCase, SuperPent): """Direct tests on the Token class.""" diff --git a/pent/test/testdata.py b/pent/test/testdata.py index f3ed75c..dfab911 100644 --- a/pent/test/testdata.py +++ b/pent/test/testdata.py @@ -29,55 +29,72 @@ from pent import Number, Sign +mblock_repeated_result = [ + [ + [["0.2", "0.3", "0.4"], ["0.3", "0.4", "0.6"]], + [["0.1", "0.1", "0.1"], ["0.5", "0.5", "0.5"]], + ], + [ + [["0.2", "0.2", "0.2"], ["0.6", "0.6", "0.6"]], + [["0.4", "0.4", "0.4"], ["0.8", "0.8", "0.8"]], + ], +] + mblock_result = [ - [["2.5", "-3.5", "0.8"], ["-1.2", "8.1", "-9.2"]], - [["-0.1", "3.5", "8.1"], ["1.4", "2.2", "-4.7"]], + [ + [["2.5", "-3.5", "0.8"], ["-1.2", "8.1", "-9.2"]], + [["-0.1", "3.5", "8.1"], ["1.4", 
"2.2", "-4.7"]], + ] ] # ## RESULTS FROM THE C2F4_01.hess FILE ## orca_hess_dipders = [ - ["-1.041194", "0.077425", "-0.236446"], - ["0.021347", "-0.053665", "0.064080"], - ["-0.064917", "0.064030", "-0.228121"], - ["1.334075", "0.025554", "-0.077999"], - ["0.025455", "0.179824", "-0.349540"], - ["-0.078013", "-0.349384", "1.132034"], - ["-0.292885", "-0.102966", "0.314406"], - ["-0.046806", "-0.126137", "0.285463"], - ["0.142946", "0.285361", "-0.903922"], - ["1.334072", "0.025555", "-0.078001"], - ["0.025456", "0.179823", "-0.349540"], - ["-0.078015", "-0.349384", "1.132038"], - ["-1.041194", "0.077424", "-0.236443"], - ["0.021345", "-0.053666", "0.064079"], - ["-0.064912", "0.064030", "-0.228121"], - ["-0.292884", "-0.102965", "0.314403"], - ["-0.046806", "-0.126136", "0.285464"], - ["0.142945", "0.285362", "-0.903926"], + [ + ["-1.041194", "0.077425", "-0.236446"], + ["0.021347", "-0.053665", "0.064080"], + ["-0.064917", "0.064030", "-0.228121"], + ["1.334075", "0.025554", "-0.077999"], + ["0.025455", "0.179824", "-0.349540"], + ["-0.078013", "-0.349384", "1.132034"], + ["-0.292885", "-0.102966", "0.314406"], + ["-0.046806", "-0.126137", "0.285463"], + ["0.142946", "0.285361", "-0.903922"], + ["1.334072", "0.025555", "-0.078001"], + ["0.025456", "0.179823", "-0.349540"], + ["-0.078015", "-0.349384", "1.132038"], + ["-1.041194", "0.077424", "-0.236443"], + ["0.021345", "-0.053666", "0.064079"], + ["-0.064912", "0.064030", "-0.228121"], + ["-0.292884", "-0.102965", "0.314403"], + ["-0.046806", "-0.126136", "0.285464"], + ["0.142945", "0.285362", "-0.903926"], + ] ] orca_hess_freqs = [ - ["0.000000"], - ["0.000000"], - ["0.000000"], - ["0.000000"], - ["0.000000"], - ["0.000000"], - ["194.490162"], - ["198.587114"], - ["389.931897"], - ["402.713910"], - ["538.244274"], - ["542.017838"], - ["548.246738"], - ["800.613516"], - ["1203.096114"], - ["1342.200360"], - ["1349.543713"], - ["1885.157022"], + [ + ["0.000000"], + ["0.000000"], + ["0.000000"], + ["0.000000"], + 
["0.000000"], + ["0.000000"], + ["194.490162"], + ["198.587114"], + ["389.931897"], + ["402.713910"], + ["538.244274"], + ["542.017838"], + ["548.246738"], + ["800.613516"], + ["1203.096114"], + ["1342.200360"], + ["1349.543713"], + ["1885.157022"], + ] ] From f1dffda01790a1946caa7b4bfa7221b90bddc506 Mon Sep 17 00:00:00 2001 From: Brian Skinn Date: Sun, 23 Sep 2018 21:40:49 -0400 Subject: [PATCH 44/44] REL: Quick buff for alpha release --- CHANGELOG.md | 12 +++++++++++- LICENSE.txt | 2 +- README.rst | 6 ++++-- doc/source/conf.py | 2 +- doc/source/index.rst | 3 +++ pent/__init__.py | 2 +- requirements-dev.txt | 2 ++ requirements-rtd.txt | 4 ++++ setup.py | 4 ++-- 9 files changed, 29 insertions(+), 8 deletions(-) create mode 100644 requirements-rtd.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index a189f02..6664369 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### [Unreleased] +... + + +### v0.1.0 [2018-09-23] + #### Features - * ... \ No newline at end of file + * Three token types implemented to date: numeric, string-literal, "any" + * Parsing of multiple levels of recursive nested data; tested only + to two levels of nesting to date. 
+ * Each nested level of structure can have head/body/tail + * Captured tokens can be easily retrieved from head/tail at the top level + parser; no good head or tail capture yet from within nested structures diff --git a/LICENSE.txt b/LICENSE.txt index 16ed8fc..31e9f1a 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017 Brian Skinn +Copyright (c) 2018 Brian Skinn Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.rst b/README.rst index 86b1d9a..13c9048 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,11 @@ pent ==== -*Pent Extracts Numerical Text* +*pent Extracts Numerical Text* Mini-language driven parser for structured numerical data. -*Development impending...* +Alpha release(s) available on PyPI: ``pip install pent`` + +Documentation (pending) is at http://pent.readthedocs.io. diff --git a/doc/source/conf.py b/doc/source/conf.py index 84f5665..b8d5334 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -27,7 +27,7 @@ # The short X.Y version version = '0.1' # The full version, including alpha/beta/rc tags -release = '0.1dev1' +release = '0.1' # -- General configuration --------------------------------------------------- diff --git a/doc/source/index.rst b/doc/source/index.rst index 11228ff..5f0cfbf 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -6,6 +6,9 @@ Welcome to pent's documentation! ================================ +*Docs pending...* + + .. 
toctree:: :maxdepth: 2 :caption: Contents: diff --git a/pent/__init__.py b/pent/__init__.py index ebbf69f..3bad0d5 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -52,4 +52,4 @@ from .token import Token -__version__ = "0.1.dev2" +__version__ = "0.1" diff --git a/requirements-dev.txt b/requirements-dev.txt index da82566..5db4161 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,4 +10,6 @@ tox wget pyparsing black +restview +twine diff --git a/requirements-rtd.txt b/requirements-rtd.txt new file mode 100644 index 0000000..047292c --- /dev/null +++ b/requirements-rtd.txt @@ -0,0 +1,4 @@ +attrs>=17.1,<18 +sphinx==1.7.6 +sphinx-issues +sphinx-rtd-theme==0.4.1 diff --git a/setup.py b/setup.py index e438e49..b17bd61 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ def readme(): setup( name="pent", version=__version__, - description="Pent Extracts Numerical Text", + description="pent Extracts Numerical Text", long_description=readme(), url="https://www.github.com/bskinn/pent", license="MIT License", @@ -35,6 +35,6 @@ def readme(): "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Utilities", - "Development Status :: 2 - Pre-Alpha", + "Development Status :: 3 - Alpha", ], )