diff --git a/.travis.yml b/.travis.yml index 179196f..242b232 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16 +1,19 @@ +dist: xenial install: - pip install -r requirements-travis.txt -# - pip install -e . -# - sh -c 'cd doc; make html; mkdir scratch' language: python python: - - 3.4 - 3.5 - 3.6 - - 3.7-dev + - 3.7 + - 3.8-dev script: + - python --version + - pip list - coverage run tests.py -a - flake8 pent -# - echo $TRAVIS_PYTHON_VERSION | grep -e '^3\.6' && sh -c 'cd doc; make doctest' || echo 'No doctest.' - - echo $TRAVIS_PYTHON_VERSION | grep -e '^3\.6' && codecov || echo "No codecov." + - do_rest=$( echo $TRAVIS_PYTHON_VERSION | grep -e '^3\.6' | wc -l ) +# - if [ $do_rest -gt 0 ]; then pip install black; black --check .; else echo "No black."; fi +# - if [ $do_rest -gt 0 ]; then sh -c 'cd doc; make doctest'; else echo "No doctest."; fi + - if [ $do_rest -gt 0 ]; then codecov; else echo "No codecov."; fi diff --git a/CHANGELOG.md b/CHANGELOG.md index e8bf55c..5c705d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,35 +5,46 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). -### [Unreleased] -... +### v0.2 [2019-10-26] + +#### Fixed + +- Optional-line flag behavior now fixed and ~thoroughly tested. + Required a change to the accepted behavior, such that an optional pattern + matches (1) the line specified, (2) a blank line, or (3) no line. + +#### Changed + +- The semantics of the 'decimal' and 'float' number tokens have been swapped. + 'Decimal' tokens will only match non-scientific-notation decimal values, while + 'float' values match either 'decimal' or 'scinot' formatted values. 
### v0.2.0rc1 [2018-10-28] #### Added - * "Misc" token type, '&', matching arbitrary non-whitespace content - * Optional whitespace can now be specified after number, literal, and misc - tokens, in addition to 'required whitespace after' and - 'no whitespace after' - * New helper function `column_stack_2d` - * Needs performance improvements for large arrays - * New 'optional line' token type - * Works irregularly, perhaps due to quirks in managing optional - groups/capture groups within the Python regex engine? - * New property flags on `Token` to indicate the new features added - ('misc' token type, optional-whitespace-after, etc.) +- "Misc" token type, '&', matching arbitrary non-whitespace content +- Optional whitespace can now be specified after number, literal, and misc + tokens, in addition to 'required whitespace after' and + 'no whitespace after' +- New helper function `column_stack_2d` + - Needs performance improvements for large arrays +- New 'optional line' token type + - Works irregularly, perhaps due to quirks in managing optional + groups/capture groups within the Python regex engine? +- New property flags on `Token` to indicate the new features added + ('misc' token type, optional-whitespace-after, etc.) #### Changed - * Switched certain lists within the `Parser.capture_struct` return - dict structure to a type that automatically passes through a dict key to the - single element of those lists, if they are length-one. This - simplifies the syntax of a number of use cases by eliminating explicit `[0]` - indexing. - * `Parser` instances now syntax-check their `head`/`body`/`tail` patterns +- Switched certain lists within the `Parser.capture_struct` return + dict structure to a type that automatically passes through a dict key to the + single element of those lists, if they are length-one. This + simplifies the syntax of a number of use cases by eliminating explicit `[0]` + indexing. 
+- `Parser` instances now syntax-check their `head`/`body`/`tail` patterns at instantiation, instead of at the first capture attempt. @@ -41,9 +52,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. #### Features - * Three token types implemented to date: numeric, string-literal, "any" - * Parsing of multiple levels of recursive nested data; tested only - to two leves of nesting to date. - * Each nested level of structure can have head/body/tail - * Captured tokens can be easily retrieved from head/tail at the top level - parser; no good head or tail capture yet from within nested structures +- Three token types implemented to date: numeric, string-literal, "any" +- Parsing of multiple levels of recursive nested data; tested only + to two levels of nesting to date. +- Each nested level of structure can have head/body/tail +- Captured tokens can be easily retrieved from head/tail at the top level + parser; no good head or tail capture yet from within nested structures + diff --git a/LICENSE.txt b/LICENSE.txt index 31e9f1a..074cfdb 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2018 Brian Skinn +Copyright (c) 2018-2019 Brian Skinn Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.rst b/README.rst index 38609d4..9b22019 100644 --- a/README.rst +++ b/README.rst @@ -1,33 +1,34 @@ pent Extracts Numerical Text ============================ -*Mini-language driven parser for structured numerical data* +*Mini-language driven parser for structured numerical (or other) data +in free text* **Current Development Version:** -.. image:: https://travis-ci.org/bskinn/pent.svg?branch=dev +.. image:: https://img.shields.io/travis/bskinn/pent?label=travis-ci&logo=travis :target: https://travis-ci.org/bskinn/pent -.. image:: https://codecov.io/gh/bskinn/pent/branch/dev/graph/badge.svg +.. 
image:: https://codecov.io/gh/bskinn/pent/branch/master/graph/badge.svg :target: https://codecov.io/gh/bskinn/pent **Most Recent Stable Release:** -.. image:: https://img.shields.io/pypi/v/pent.svg +.. image:: https://img.shields.io/pypi/v/pent.svg?logo=pypi :target: https://pypi.org/project/pent -.. image:: https://img.shields.io/pypi/pyversions/pent.svg +.. image:: https://img.shields.io/pypi/pyversions/pent.svg?logo=python **Info:** -.. image:: https://img.shields.io/readthedocs/pent/latest.svg +.. image:: https://img.shields.io/readthedocs/pent/latest :target: http://pent.readthedocs.io/en/latest/ .. image:: https://img.shields.io/github/license/mashape/apistatus.svg - :target: https://github.com/bskinn/pent/blob/master/LICENSE.txt + :target: https://github.com/bskinn/pent/blob/stable/LICENSE.txt .. image:: https://img.shields.io/badge/code%20style-black-000000.svg - :target: https://github.com/ambv/black + :target: https://github.com/psf/black ---- @@ -73,8 +74,8 @@ but that's just exhausting drudgery if there are dozens of files involved. Automating the parsing via a line-by-line string search would work fine (this is how |cclib|_ implements its data imports), but a new line-by-line -method must be implemented any time one encounters a new kind of dataset, -and any time the formatting of a given dataset changes between software versions. +method is needed for every new kind of dataset, +and any time the formatting of a given dataset changes. It's not *too* hard to `write regex `__ @@ -93,8 +94,7 @@ of lines, without writing **any** regex at all: .. code:: python - >>> with (pathlib.Path() / "pent" / "test" / "C2F4_01.hess").open() as f: - ... data = f.read() + >>> data = pathlib.Path("pent", "test", "C2F4_01.hess").read_text() >>> prs = pent.Parser( ... head=("@.$vibrational_frequencies", "#.+i"), ... body=("#.+i #!..f") @@ -127,7 +127,7 @@ column vector, because the data runs down the column in the file. 
``pent`` can handle larger, more deeply nested data as well. Take `this 18x18 matrix `__ within ``C2F4_01.hess``, for example. -Here, it's necessary to pass a ``Parser`` as the `body` of another ``Parser``: +Here, it's necessary to pass a ``Parser`` as the *body* of another ``Parser``: .. code:: python @@ -139,16 +139,18 @@ Here, it's necessary to pass a ``Parser`` as the *body* of another ``Parser``: ... ) ... ) >>> result = prs_hess.capture_body(data) - >>> arr = np.column_stack(np.array(_, dtype=float) for _ in result[0]) + >>> arr = np.column_stack([np.array(_, dtype=float) for _ in result[0]]) >>> print(arr[:3, :7]) [[ 0.468819 -0.006771 0.020586 -0.38269 0.017874 -0.05449 -0.044552] [-0.006719 0.022602 -0.016183 0.010997 -0.033397 0.014422 -0.01501 ] [ 0.020559 -0.016184 0.066859 -0.033601 0.014417 -0.072836 0.045825]] -The need for the ``for``/``in`` iteration expression, the ``[0]`` index into ``result``, +The need for the list comprehension, the ``[0]`` index into ``result``, and the composition via ``np.column_stack`` arises due to the manner in which ``pent`` returns data from a nested match like this. -See the `documentation `__ for more information. +See the `documentation `__, +in particular `this example `__, +for more information. The grammar of the ``pent`` mini-language is designed to be flexible enough that it should handle essentially all well-formed structured data, and even some data @@ -159,21 +161,22 @@ parsing `this data block `__: ``pip install pent`` +Beta releases available on `PyPI `__: ``pip install pent`` -Full documentation (pending) is hosted at +Full documentation is hosted at `Read The Docs `__. Source on `GitHub `__. Bug reports, -feature requests, and ``Parser`` pattern composition help requests +feature requests, and ``Parser`` construction help requests are welcomed at the `Issues `__ page there. -Copyright (c) Brian Skinn 2018 +Copyright (c) Brian Skinn 2018-2019 License: The MIT License. 
See `LICENSE.txt `__ for full license terms. + .. |cclib| replace:: ``cclib`` -.. _cclib: https://github.com/cclib/cclib \ No newline at end of file +.. _cclib: https://github.com/cclib/cclib diff --git a/doc/Makefile b/doc/Makefile index 7e8794a..7f02c27 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -14,7 +14,11 @@ help: .PHONY: help Makefile +# sphinx-autobuild target +livehtml: + sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O) + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/make.bat b/doc/make.bat index e37613f..08c539d 100644 --- a/doc/make.bat +++ b/doc/make.bat @@ -26,7 +26,11 @@ if errorlevel 9009 ( exit /b 1 ) -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %2 +if "%1" == "livehtml" ( + sphinx-autobuild %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %2 +) else ( + %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %2 +) goto end :help diff --git a/doc/source/_static/.pin b/doc/source/_static/.pin new file mode 100644 index 0000000..e69de29 diff --git a/doc/source/api.rst b/doc/source/api.rst new file mode 100644 index 0000000..903aa6f --- /dev/null +++ b/doc/source/api.rst @@ -0,0 +1,33 @@ +.. Dump of an API page, until it gets cleaned up + +API (draft page) +================ + +Unstructured API dump, to provide cross-reference targets +for other portions of the docs. + +Any of the objects/attributes/methods documented here may +become private implementation details in future +versions of ``pent``. + + +.. automodule:: pent.parser + :members: + +.. automodule:: pent.token + :members: + +.. automodule:: pent.patterns + :members: + +.. automodule:: pent.enums + :members: + +.. automodule:: pent.errors + :members: + +.. 
automodule:: pent.thrulist + :members: + +.. automodule:: pent.utils + :members: diff --git a/doc/source/conf.py b/doc/source/conf.py index b8d5334..b57cb77 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -14,6 +14,7 @@ # import os import os.path as osp +import re import sys sys.path.insert(0, osp.abspath(osp.join(os.pardir, os.pardir))) @@ -24,10 +25,12 @@ copyright = '2018, Brian Skinn' author = 'Brian Skinn' +from pent import __version__ + # The short X.Y version -version = '0.1' +version = re.match(r"\d+\.\d+", __version__).group() # The full version, including alpha/beta/rc tags -release = '0.1' +release = __version__ # -- General configuration --------------------------------------------------- @@ -76,6 +79,72 @@ # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' +rst_epilog = """ + +.. |str| replace:: :class:`str` + +.. |list| replace:: :class:`list` + +.. |tuple| replace:: :class:`tuple` + +.. |None| replace:: :obj:`None` + +.. |dict| replace:: :obj:`dict` + +.. |ndarray| replace:: :class:`~numpy.ndarray` + +.. |Parser| replace:: :class:`~pent.parser.Parser` + +.. |Parsers| replace:: :class:`Parsers ` + +.. |Token| replace:: :class:`~pent.token.Token` + +.. |ThruList| replace:: :class:`~pent.thrulist.ThruList` + +.. |capture_body| replace:: :meth:`~pent.parser.Parser.capture_body` + +.. |capture_struct| replace:: :meth:`~pent.parser.Parser.capture_struct` + +.. |license_txt| replace:: LICENSE.txt + +.. _license_txt: https://github.com/bskinn/pent/blob/master/LICENSE.txt + +.. |cour| raw:: html + + + +.. |/cour| raw:: html + + + +.. |br| raw:: html + +
+ +.. |nbsp| raw:: html + +   + +""" + +doctest_global_setup = r"""\ + +from textwrap import dedent + +import numpy as np + +import pent + +def check_pattern(*, pattern, text): + prs = pent.Parser(body=pattern) + print("MATCH" if len(prs.capture_body(text)) > 0 else "NO MATCH", end='\n\n') + +def show_capture(*, pattern, text): + prs = pent.Parser(body=pattern) + print(prs.capture_body(text), end='\n\n') + +""" + # -- Options for HTML output ------------------------------------------------- @@ -168,9 +237,14 @@ # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'python': ('https://docs.python.org/', 'isphx/objects_python.inv')} +intersphinx_mapping = {'python': ('https://docs.python.org/3/', (None, 'isphx/objects_python.inv')), + 'numpy': ('https://docs.scipy.org/doc/numpy/', (None, 'isphx/objects_numpy.inv')),} # -- Options for todo extension ---------------------------------------------- # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True + +# -- Options for sphinx_issues ---- + +issues_github_path = "bskinn/pent" \ No newline at end of file diff --git a/doc/source/grammar.rst b/doc/source/grammar.rst new file mode 100644 index 0000000..57be750 --- /dev/null +++ b/doc/source/grammar.rst @@ -0,0 +1,56 @@ +.. pent mini-language grammar + +pent Mini-Language Grammar +========================== + +As discussed :doc:`here `, a ``pent`` |Parser| +is constructed by passing it *patterns* composed of *tokens*. The grammar below +specifies what constitutes a valid ``pent`` *token*. + +For completeness, even though the +:ref:`optional-line pattern flag ` +is called a *flag* and not a *token*, internally ``pent`` parses this flag +as though it were a token, and thus it is included here. + +This grammar is expressed in an approximation of +`extended Backus-Naur form `__. 
+Content in double quotes represents a literal string, the pipe character indicates alternatives, +square brackets indicate *optional* token flags, and parentheses indicate *required* token flags. + +**Grammar** + +.. code-block:: none + + token ::= optional_line_flag | content_token + + optional_line_flag ::= "?" + content_token ::= any_token | literal_token | misc_token | number_token + + any_token ::= "~"[capture] + literal_token ::= "@"[space_after][capture](quantity)(literal_content) + misc_token ::= "&"[space_after][capture](quantity) + number_token ::= "#"[space_after][capture](quantity)(sign)(num_type) + + space_after ::= optional_space_after | no_space_after + optional_space_after ::= "o" + no_space_after ::= "x" + + capture ::= "!" + + quantity ::= match_one | match_one_or_more + match_one ::= "." + match_one_or_more ::= "+" + + sign ::= any_sign | positive_sign | negative_sign + any_sign ::= "." + positive_sign ::= "+" + negative_sign ::= "-" + + num_type ::= integer | decimal | sci_not | float | general + integer ::= "i" + decimal ::= "d" + sci_not ::= "s" + float ::= "f" + general ::= "g" + + diff --git a/doc/source/index.rst b/doc/source/index.rst index 5f0cfbf..c61cbeb 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -3,15 +3,99 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to pent's documentation! -================================ +Welcome to pent! +================ -*Docs pending...* +A common frustration in data analysis is software tooling +that *only* generates its output in human-readable fashion. Thus, +even if there is visible structure to the data, that structure is +embedded in a format that can be awkward to parse. +Take the following toy data: + +.. doctest:: toy + + >>> text = """{lots of content} + ... + ... $data1 + ... 0 0.000 + ... 1 -3.853 + ... 2 1.219 + ... + ... $data2 + ... 0 3.142 + ... 1 2.718 + ... 2 6.022 + ... + ... 
{lots more content}""" + +Say it's needed to extract the list of decimal values in +|cour|\ $data1\ |/cour|, *without* the accompanying integers. +Further, say that in any given particular output file, +this list of values can be of *any* length. + +One could write a line-by-line search to parse out the values, +but that's a slow way to go about it if there are many such data blocks +that need to be extracted. + +Regex is a pretty natural tool to use here, but writing the regex +to retrieve these values is a non-trivial task: because of the way +regex capture groups work, you really have to write *two* regexes. +The first regex captures the whole chunk of text of interest, +and the second searches within that chunk to capture the values +from the individual lines. + +``pent`` **writes all this regex for you.** + +All you have to do is provide ``pent`` with the structure of +the text using its custom mini-language, +including which parts should be captured for output, +and it will scrape the data directly from the text: + +.. doctest:: toy + + >>> prs = pent.Parser( + ... head="@.$data1", + ... body="#.+i #!..d", + ... ) + >>> prs.capture_body(text) + [[['0.000'], ['-3.853'], ['1.219']]] + +This is just one example of ``pent``\ 's parsing +capabilities---it's an extremely flexible tool, which can retrieve +just about anything you want from just about any +surrounding text. + +Usage instructions for ``pent`` are provided in the +:doc:`tutorial `, broken up into +(1) an explanation of the +:doc:`basics ` of the syntax +and (2) exposition of a number of +(more-)realistic :doc:`examples `. +For those so inclined, a formal grammar of +the mini-language is also :doc:`provided `. + +What ``pent`` is not +-------------------- + +``pent`` is **not** well suited for parsing data with an extensively +nested or recursive structure, especially if that structure +is defined by clear rules. Have JSON, XML, or YAML? 
There are other +libraries specifically made for those formats, and you should use them. +``pent`` ultimately is just a fancy regex generator, and thus it +carries the same functional constraints. If you build a |Parser| +that is too complex, it *will* run until approximately +the heat death of the universe! + + +**Contents:** .. toctree:: - :maxdepth: 2 - :caption: Contents: + :maxdepth: 1 + + tutorial + grammar + api diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst new file mode 100644 index 0000000..4a3113d --- /dev/null +++ b/doc/source/tutorial.rst @@ -0,0 +1,32 @@ +.. pent parser tutorial + +pent Parser Tutorial +==================== + +There is almost always more than one way to construct a ``pent`` |Parser| +to capture a given dataset. Sometimes, if the data +format is complex or contains irrelevant content interspersed with the +data of interest, significant pre- or post-processing may be required. As well, +it's important to inspect your starting data carefully, often by +loading it into a Python string, to be sure there aren't, say, a bunch of +unprintable characters floating around and fouling the regex matches. + +This tutorial starts by describing the basic structure of +the semantic components of ``pent``'s parsing model: +*tokens*, *patterns*, and |Parsers|. +It then lays out some approaches to constructing |Parsers| +for realistic datasets, with the goal of enabling new users +to get quickly up to speed +building their own |Parsers|. + +For a formal description of the +grammar of the tokens used herein, see the +:doc:`grammar`. + + +.. toctree:: + :maxdepth: 2 + + tutorial/basics + tutorial/examples + diff --git a/doc/source/tutorial/basics.rst b/doc/source/tutorial/basics.rst new file mode 100644 index 0000000..55ab572 --- /dev/null +++ b/doc/source/tutorial/basics.rst @@ -0,0 +1,29 @@ +.. 
~Glossary of terms, plus some explanation + +Basic Usage +=========== + +``pent`` searches text in a line-by-line fashion, +where a line of text is delimited by the start/end +of the string, and/or by newline(s). + +Each line of text to be matched by ``pent`` is represented +by a *pattern*, passed into a |Parser|. +Each *pattern* is a string composed of zero or more whitespace-separated *tokens*, +which define in a structured way what the overall *pattern* should match. +Both *patterns* **and** *tokens* can also include *flags*, +which modify the semantics of how they are processed. + +At present, whitespace is hardcoded to include only spaces and +tab characters (|cour|\ \\t\ |/cour|). Various options for user-configurable +whitespace definition are planned (:issue:`26`). + + +.. toctree:: + :maxdepth: 1 + + Tokens + Patterns + Parsers + + diff --git a/doc/source/tutorial/basics/parsers.rst b/doc/source/tutorial/basics/parsers.rst new file mode 100644 index 0000000..3fd600a --- /dev/null +++ b/doc/source/tutorial/basics/parsers.rst @@ -0,0 +1,273 @@ +.. Introducing the Parser semantics + +Basic Usage: Parsers +==================== + +The |Parser| is the main user-facing interface to ``pent``, +where the patterns matching the data of interest +are defined. |Parsers| are created with three arguments, +*head*, *body*, and *tail*. All |Parsers| must have a *body*; +*head* and *tail* are optional. + +A section of text matched by +a given |Parser| will have the following structure: + +- If *head* is defined, it will be matched exactly once, + and its content must immediately precede the *body* content. + +- *body* will be matched one or more times. + +- If *tail* is defined, it will be matched exactly once, + and its content must immediately follow the *body* content. + +Each of *head*, *body*, and *tail* can be one of three things: + +1. A single ``pent`` :doc:`pattern `, + matching a single line of text + +2. An ordered iterable (|tuple|, |list|, etc.) 
of patterns, + matching a number of lines of text equal to the length + of the iterable + +3. A |Parser|, matching its entire contents + +The syntax and matching structure of |Parsers| using +these three kinds of arguments are illustrated below +using trivial examples. Application of ``pent`` to +more-realistic situations is demonstrated in the +:doc:`Examples section ` of the tutorial. + +In the below examples, most illustrations are of the use +of *head*, rather than *tail*. However, the principles +apply equally well to both. + + +Matching with Single Patterns +----------------------------- + +The simplest possible |Parser| only has *body* defined, +containing a single ``pent`` +:doc:`pattern `: + +.. doctest:: single_patterns + + >>> prs = pent.Parser(body="@!.bar") + >>> text = """foo + ... bar + ... baz""" + >>> prs.capture_body(text) + [[['bar']]] + +As noted, *body* will match multiple times in a row: + +.. doctest:: single_patterns + + >>> text = """foo + ... bar + ... bar + ... bar + ... baz""" + >>> prs.capture_body(text) + [[['bar'], ['bar'], ['bar']]] + +Multiple occurrences of *body* in the text will match independently: + +.. doctest:: single_patterns + + >>> text = """foo + ... bar + ... baz + ... bar + ... baz""" + >>> prs.capture_body(text) + [[['bar']], [['bar']]] + +If only that first |cour|\ bar\ |/cour| is of interest, +the |Parser| match can be constrained with a *head*: + +.. doctest:: single_patterns + + >>> prs_head = pent.Parser(head="@.foo", body="@!.bar") + >>> prs_head.capture_body(text) + [[['bar']]] + +Adding just a *tail* doesn't really help, since +|cour|\ baz\ |/cour| follows both instances of +|cour|\ bar\ |/cour|: + +.. 
doctest:: single_patterns + + >>> prs_tail = pent.Parser(body="@!.bar", tail="@.baz") + >>> prs_tail.capture_body(text) + [[['bar']], [['bar']]] + + +Matching with Iterables of Patterns +----------------------------------- + +Sometimes data is structured in such a way that +it's necessary to associate more than one line of text +with a given portion of a |Parser|. This is most +common with *head* and *tail*, but it can occur with +*body* as well. These situations are addressed +by using iterables of patterns when +instantiating a |Parser|. + +The following is a situation where the header +portion of the data contains two lines, +one being a string label and the other being a series +of integers, and it's +important to capture only the "wanted" data block: + +.. doctest:: iterables + + >>> text = """WANTED_DATA + ... 1 2 3 + ... 1.5 2.1 1.1 + ... + ... UNWANTED_DATA + ... 1 2 3 + ... 0.1 0.4 0.2 + ... """ + >>> pent.Parser( + ... head=("@.WANTED_DATA", "#++i"), + ... body="#!++d" + ... ).capture_body(text) + [[['1.5', '2.1', '1.1']]] + +Note that even though |cour|\ WANTED_DATA\ |/cour| appears in the header +line of the 'unwanted' data block, since the +|cour|\ @.WANTED_DATA\ |/cour| token does not match +the *complete* contents of |cour|\ UNWANTED_DATA\ |/cour|, +the |Parser| does not match that second block. + +If *head* were left out, or defined just to match the +rows of integers, both datasets would be retrieved: + +.. doctest:: iterables + + >>> pent.Parser(head="#++i", body="#!++d").capture_body(text) + [[['1.5', '2.1', '1.1']], [['0.1', '0.4', '0.2']]] + +Situations calling for passing an iterable into *body* are less common, +but can occur if there is a strictly repeating, cyclic pattern to the +text to be parsed: + +.. doctest:: iterables + + >>> text_good = """DATA + ... foo + ... bar + ... foo + ... bar + ... foo + ... bar""" + >>> prs = pent.Parser( + ... head="@.DATA", + ... body=("@!.foo", "@!.bar") + ... 
) + >>> prs.capture_body(text_good) + [[['foo', 'bar'], ['foo', 'bar'], ['foo', 'bar']]] + +Note in the |cour|\ .capture_body()\ |/cour| output that even though +each |cour|\ foo\ |/cour| and |cour|\ bar\ |/cour| appear on separate +lines in the text, because the capture of each pair is defined +as the *body* of a single |Parser|, they end up being treated as though +they had been on the same line. Another example of this behavior +can be found in :doc:`this tutorial example `. + +If the lines of body text are not strictly cyclic-repeating, +this approach won't work: + +.. doctest:: iterables + + >>> text_bad = """DATA + ... foo + ... bar + ... + ... foo + ... bar""" + >>> prs.capture_body(text_bad) + [[['foo', 'bar']]] + +There are other approaches that can handle such situations, +such as the +:ref:`optional-line pattern flag `: + +.. doctest:: iterables + + >>> pent.Parser( + ... head="? @.DATA", + ... body=("@!.foo", "@!.bar") + ... ).capture_body(text_bad) + [[['foo', 'bar']], [['foo', 'bar']]] + + +Matching with a Nested |Parser| +------------------------------- + +For data with more complex internal structure, often the best +way to match it is to pass a |Parser| to one or more of +*head*, *body*, or *tail*. + +In situations where the header or footer content has a variable +number of lines that all match the same pattern, passing +a |Parser| is often the most concise approach, as it +exploits the implicit matching of one-or-more lines by +the *body* of that internal |Parser|: + +.. doctest:: parsers + + >>> text_head = """foo + ... 1 2 3 + ... bar + ... bar + ... + ... foo + ... 1 2 3 + ... 4 5 6 7 8 + ... 9 10 + ... bar + ... bar + ... bar""" + >>> prs_head = pent.Parser( + ... head=pent.Parser( + ... head="@.foo", + ... body="#++i", + ... ), + ... body="@!.bar", + ... 
) + >>> prs_head.capture_body(text_head) + [[['bar'], ['bar']], [['bar'], ['bar'], ['bar']]] + +Another common use of an internal |Parser| is when +the main data content itself has a header/body/footer structure, +but it is also necessary to specify an overall header for the +data in order to avoid capturing multiple times within the +broader text: + +.. doctest:: parsers + + >>> text_body = """WANTED + ... foo + ... bar + ... bar + ... + ... UNWANTED + ... foo + ... bar + ... bar + ... bar + ... bar""" + >>> prs_body = pent.Parser( + ... head="@.WANTED", + ... body=pent.Parser( + ... head="@.foo", + ... body="@!.bar", + ... ), + ... ) + >>> prs_body.capture_body(text_body) + [[[['bar'], ['bar']]]] + +A clearer description of this approach is provided in +:doc:`this tutorial example `. diff --git a/doc/source/tutorial/basics/patterns.rst b/doc/source/tutorial/basics/patterns.rst new file mode 100644 index 0000000..3265311 --- /dev/null +++ b/doc/source/tutorial/basics/patterns.rst @@ -0,0 +1,184 @@ +.. Pattern-level semantics + +Basic Usage: Patterns +===================== + +A ``pent`` *pattern* is a series of whitespace-delimited +:doc:`tokens ` that represents **all** non-whitespace +content on a given line of text. + +A blank line---one that is empty, or contains only +whitespace---can be matched with an empty pattern string: + +.. doctest:: blank + + >>> check_pattern(pattern="", text="") + MATCH + + >>> check_pattern(pattern="", text=" ") + MATCH + + >>> check_pattern(pattern="", text=" \t ") + MATCH + + +If a line contains one piece of non-whitespace text, +a single token will suffice to match the whole line: + +.. 
doctest:: one_piece + + >>> check_pattern(pattern="&.", text="foo") + MATCH + + >>> check_pattern(pattern="&.", text=" foo") + MATCH + + >>> check_pattern(pattern="#..i", text="-5") + MATCH + + >>> check_pattern(pattern="#..i", text=" 50000 ") + MATCH + + >>> check_pattern(pattern="#..f", text="2") # Wrong number type + NO MATCH + + >>> check_pattern(pattern="#.-i", text="2") # Wrong number sign + NO MATCH + + >>> check_pattern(pattern="", text="42") # Line is not blank + NO MATCH + + +If a line contains more than one piece of non-whitespace text, +**all pieces** must be matched by a token in the pattern: + +.. doctest:: multi_pieces + + >>> check_pattern(pattern="&+", text="foo bar baz") # One-or-more gets all three + MATCH + + >>> check_pattern(pattern="&. &.", text="foo bar baz") # Only 2/3 words matched + NO MATCH + + >>> check_pattern(pattern="&. #..i", text="foo 42") + MATCH + + >>> check_pattern(pattern="&+ #..i", text="foo bar baz 42") + MATCH + + >>> check_pattern(pattern="#+.i", text="-2 -1 0 1 2") + MATCH + + >>> check_pattern(pattern="#+.i", text="-2 -1 foo 1 2") # 'foo' is not an int + NO MATCH + + >>> check_pattern(pattern="#+.i &. #+.i", text="-2 -1 foo 1 2") + MATCH + + +Be careful when using "|cour|\ ~\ |/cour|" and +"|cour|\ &+\ |/cour|", as they **may** match +more aggressively than expected: + +.. doctest:: aggressive_matching + + >>> check_pattern(pattern="~ #+.i", text="foo bar 42 34") + MATCH + + >>> show_capture(pattern="~! #+.i", text="foo bar 42 34") + [[['foo', 'bar']]] + + >>> check_pattern(pattern="&+ #+.i", text="foo bar 42 34") + MATCH + + >>> show_capture(pattern="&!+ #+.i", text="foo bar 42 34") + [[['foo', 'bar', '42']]] + + >>> check_pattern(pattern="&+ #+.i", text="foo 42 bar 34") + MATCH + + >>> show_capture(pattern="&!+ #+.i", text="foo 42 bar 34") + [[['foo', '42', 'bar']]] + + + +Punctuation will foul matches unless explicitly accounted for: + +.. 
doctest:: punctuation + + >>> check_pattern(pattern="#+.i", text="1 2 ---- 3 4") + NO MATCH + + >>> check_pattern(pattern="#+.i &. #+.i", text="1 2 ---- 3 4") + MATCH + + + +In situations where punctuation is directly adjacent to the content +to be captured, the :ref:`space-after flags ` +must be used to modify ``pent``'s expectations for whitespace: + +.. doctest:: whitespace + + >>> check_pattern(pattern="~ #..d @..", text="The value is 3.1415.") # No space between number and '.' + NO MATCH + + >>> check_pattern(pattern="~ #x..d @..", text="The value is 3.1415.") + MATCH + + + +In situations where some initial content will definitely appear on a line, +but some additional trailing content *may or may not* appear at the end of the line, +it's important to use one of the space-after modifier flags in order for +``pent`` to find a match when the trailing content is absent. +This is because the default required +trailing whitespace will (naturally) *require* whitespace to be present +between the end of the matched content and the end of the line, +and if EOL immediately follows the content the pattern match will fail, +since the required whitespace is absent: + +.. doctest:: eol_optional + + >>> check_pattern(pattern="&. #.+i ~", text="always 42 sometimes") + MATCH + + >>> check_pattern(pattern="&. #.+i ~", text="always 42") + NO MATCH + + >>> check_pattern(pattern="&. #.+i ~", text="always 42 ") + MATCH + + >>> check_pattern(pattern="&. #x.+i ~", text="always 42") + MATCH + + >>> check_pattern(pattern="&. #x.+i ~", text="always 42 sometimes") + MATCH + + + +.. _tutorial-basics-patterns-optionallineflag: + +Optional Line Flag: |cour|\ ?\ |/cour| +-------------------------------------- + +In some cases, an entire line of text will be present in some occurrences +of a desired |Parser| match with a block of text, but absent in others. +To accommodate such situations, ``pent`` recognizes an 'optional-line flag' in a pattern. 
+This flag is a sole "|cour|\ ?\ |/cour|", occurring as the first "token" +in the pattern. Inclusion of this flag will cause the pattern +to match in the following three cases: + +1. A line is present that completely matches the optional pattern + (per usual behavior). + +2. A blank line (no non-whitespace content) is present where the + optional pattern would match. + +3. **NO** line is present where the optional pattern would match. + +It is difficult to construct meaningful examples of this behavior +without using a full |Parser| construction; as such, see +:ref:`this tutorial page ` +for more details. + diff --git a/doc/source/tutorial/basics/tokens.rst b/doc/source/tutorial/basics/tokens.rst new file mode 100644 index 0000000..ece7813 --- /dev/null +++ b/doc/source/tutorial/basics/tokens.rst @@ -0,0 +1,221 @@ +.. Token-level semantics + +Basic Usage: Tokens +=================== + +``pent`` understands four kinds of tokens, which match varying types of content. +One is an :ref:`'any' token `, +which matches an arbitrary span of whitespace and/or +non-whitespace content. The other three types are intended to match specific kinds of +content within the line of text that are often, but not always, +separated from surrounding content by whitespace. + +All four kinds of tokens accept a :ref:`flag ` +that instructs the encapsulating +|Parser| to capture the content matching the token for output. +A subset of the tokens accepts a :ref:`flag ` +that alters how the |Parser| handles the presence or absence of whitespace +following the content matching the token. + + +.. _tutorial-basics-tokens-anytoken: + +The 'Any' Token: |cour|\ ~\ |/cour| +----------------------------------- + +The 'any' token will match **anything**, including a completely blank line. +It behaves essentially the same as "|cour|\ .*\ |/cour|" in regex. + +Currently, the 'any' token only accepts the +:ref:`'capture' flag ` +(becoming "|cour|\ ~!\ |/cour|"). 
Addition of support for the +:ref:`'space-after' flags ` +is planned (:issue:`78`). + +Note that any content matched by a capturing 'any' token will be +split at whitespace in |Parser| output. + + +.. _tutorial-basics-tokens-misctoken: + +The 'Misc' Token: |cour|\ &\ |/cour| +------------------------------------ + +The 'misc' token matches any sequence of non-whitespace characters. +Its uses are similar to the :ref:`'any' token `, +except that its match +is confined to a single whitespace-delimited piece of content. +It is mainly intended for use on non-numerical data +whose content is not constant, and thus +the :ref:`'literal' token ` cannot be used. + +The 'misc' token has one required argument, indicating whether +it should match exactly one piece of content +(|cour|\ &.\ |/cour|) or one-or-more pieces of content +(|cour|\ &+\ |/cour|). When matching one-or-more, +the 'misc' token interleaves *required* whitespace +between each repetition. + +At this time, the functional difference between +"|cour|\ ~\ |/cour|" and "|cour|\ &+\ |/cour|" is minimal. + +The 'misc' token accepts both the +:ref:`capture flag ` +and the :ref:`space-after ` modifier flags. + + +.. _tutorial-basics-tokens-literaltoken: + +The 'Literal' Token: |cour|\ @\ |/cour| +--------------------------------------- + +The 'literal' token matches an *exact* sequence of one or more +whitespace-delimited characters, which is provided as a required argument +in the token definition. + +Similar to the :ref:`'misc' token `, +the 'literal' token also has +the quantity specifier as a required argument: +either "|cour|\ @.\ |/cour|" for exactly one match +or "|cour|\ @+\ |/cour|" for one-or-more matches. + +The argument for the string to be matched follows the +quantity argument. Thus, to match the text +|cour|\ foo\ |/cour| exactly once a suitable token +might be "|cour|\ @.foo\ |/cour|". 
+ +In the situation where it's needed to match a literal string +containing a space, the entire token can be enclosed in +quotes: "|cour|\ '@.this has spaces'\ |/cour|". + +The 'literal' token differs from the +:ref:`'misc' ` and +:ref:`'number' ` tokens +in that when the one-or-more argument is used, it **prohibits** +whitespace between the repetitions. +This allows, e.g., a long sequence of hyphens to be represented +by a token like "|cour|\ @+-\ |/cour|". Similarly, a long +sequence of alternating hyphens and spaces could be represented +by "|cour|\ '@+- '\ |/cour|". + +The 'literal' token accepts both the +:ref:`capture flag ` +and the :ref:`space-after ` modifier flags. + + +.. _tutorial-basics-tokens-numbertoken: + +The 'Number' Token: |cour|\ #\ |/cour| +-------------------------------------- + +The 'number' token allows for selectively matching numbers of varying +types in the text being parsed; in particular, matches can be constrained +by sign (positive, negative, or either) or by format +(integer, decimal, or scientific notation; or, combinations of these). + +The 'number' token takes three required, single-character arguments: + +1. Quantity: |br| + |cour|\ #.\ |/cour| for exactly one, or |br| + |cour|\ #+\ |/cour| for one-or-more. |br| + |nbsp| + +2. Sign: |br| + |cour|\ #[.+]+\ |/cour| for positive, |br| + |cour|\ #[.+]-\ |/cour| for negative, or |br| + |cour|\ #[.+].\ |/cour| for either sign. |br| + |nbsp| + +3. Number Format: |br| + |cour|\ #[.+][.-+]i\ |/cour| for integer, |br| + |cour|\ #[.+][.-+]d\ |/cour| for decimal, |br| + |cour|\ #[.+][.-+]s\ |/cour| for scientific notation, |br| + |cour|\ #[.+][.-+]f\ |/cour| for float (decimal or scinot) |br| + |cour|\ #[.+][.-+]g\ |/cour| for general (integer or float). 
 + +The ability to specify different types of number formatting was implemented +for this token because it is often the case that numbers printed +in different formats have different semantic significance, +and it's thus useful to be able to filter/capture based on that format. +:ref:`This example ` +illustrates a simplified case of this. + +As with the :ref:`'misc' token `, +when matching in one-or-more quantity mode, +the 'number' token interleaves *required* whitespace between each repetition. + +The 'number' token accepts both the +:ref:`capture flag ` +and the :ref:`space-after ` modifier flags. + + +.. _tutorial-basics-tokens-flags: + +Token Flags +----------- + +Currently, two types of flags can be passed to tokens: +:ref:`capture flag ` +and the :ref:`space-after ` modifier flags. + +If both flags are used in a given token, the space-after modifier +flag must **precede** the capture flag. + + +.. _tutorial-basics-tokens-captureflag: + +Capture Flag: |cour|\ !\ |/cour| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In most cases, not all of the data in a block of text is of interest +for downstream processing. Thus, ``pent`` provides the token-level +'capture' flag, "|cour|\ !\ |/cour|", which marks +the content of that token for inclusion in the output of +:meth:`~pent.parser.Parser.capture_body` and +:meth:`~pent.parser.Parser.capture_struct`. +The 'capture' flag is an integral part of all of the +:doc:`tutorial examples `. + + +.. _tutorial-basics-tokens-spaceflags: + +Space-After Flags: |cour|\ o\ |/cour| and |cour|\ x\ |/cour| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With no space-after flag provided, all tokens *REQUIRE* the presence +of trailing whitespace (or EOL) +in order to match. This is because most content is anticipated to be +whitespace-delineated, and thus this default leads to +more concise |Parser| definitions. 
+ +However, there are situations where changing this behavior is +useful for defining a well-targeted |Parser|, and some where +changing it is necessary in order to compose +a functional |Parser| at all. + +As an example, take the following line of text: + +.. code:: + + The foo is in the foo. + +The token "|cour|\ @.foo\ |/cour|" +would match the first occurrence of the word "foo", +because it has whitespace after it, but it would +*not* match the second occurrence, since it is +immediately followed by a period. + +In order to match both occurrences, the +'optional trailing whitespace flag', +"|cour|\ o\ |/cour|", could be added, leading +to the token "|cour|\ @o.foo\ |/cour|". + +If it were desired only to match the second occurrence, +the 'prohibited trailing whitespace flag', +"|cour|\ x\ |/cour|", could be added, +yielding "|cour|\ @x.foo\ |/cour|". + +:doc:`This tutorial example ` +provides further illustration of the use of these flags +in more-realistic situations. + diff --git a/doc/source/tutorial/examples.rst b/doc/source/tutorial/examples.rst new file mode 100644 index 0000000..b8d1b28 --- /dev/null +++ b/doc/source/tutorial/examples.rst @@ -0,0 +1,24 @@ +.. Tutorial - examples subsection + +Examples +======== + +This section of the tutorial contains examples of applications +of ``pent`` to parsing of "real" (or, at least "real-like") datasets. + +.. toctree:: + :maxdepth: 1 + + examples/single_parser + examples/nested_parsers + examples/misc + examples/post_process + examples/internal_spaces + examples/optional_line + examples/space_after + examples/any_eol + examples/data_cleanup + examples/generated_regex + +*\* Incomplete* + diff --git a/doc/source/tutorial/examples/any_eol.rst b/doc/source/tutorial/examples/any_eol.rst new file mode 100644 index 0000000..10ea8a4 --- /dev/null +++ b/doc/source/tutorial/examples/any_eol.rst @@ -0,0 +1,6 @@ +.. 
Admonition about 'any' tokens @ EOL + +\*'Any' Tokens at EOL +===================== + +**TODO** per :issue:`45` diff --git a/doc/source/tutorial/examples/data_cleanup.rst b/doc/source/tutorial/examples/data_cleanup.rst new file mode 100644 index 0000000..64fddfa --- /dev/null +++ b/doc/source/tutorial/examples/data_cleanup.rst @@ -0,0 +1,6 @@ +.. Example where data cleanup is needed + +\*Pre-Processing/Data Cleanup Example +===================================== + +*pending* diff --git a/doc/source/tutorial/examples/generated_regex.rst b/doc/source/tutorial/examples/generated_regex.rst new file mode 100644 index 0000000..d7396e3 --- /dev/null +++ b/doc/source/tutorial/examples/generated_regex.rst @@ -0,0 +1,6 @@ +.. Showing some of the regex + +\*Examples of |Parser|\ -Generated Regex +======================================== + +*pending* diff --git a/doc/source/tutorial/examples/internal_spaces.rst b/doc/source/tutorial/examples/internal_spaces.rst new file mode 100644 index 0000000..8f0f50f --- /dev/null +++ b/doc/source/tutorial/examples/internal_spaces.rst @@ -0,0 +1,7 @@ +.. Demo of internal spaces differences + +\*Internal Spaces in One-Or-More Matches +======================================== + +*Illustration of how misc/number and literal +token types handle them differently.* \ No newline at end of file diff --git a/doc/source/tutorial/examples/misc.rst b/doc/source/tutorial/examples/misc.rst new file mode 100644 index 0000000..c040d27 --- /dev/null +++ b/doc/source/tutorial/examples/misc.rst @@ -0,0 +1,100 @@ +.. Misc token + +The Misc Token +============== + +Sometimes, data is laid out in text in a fashion +where it cannot be matched using only numerical values. +Either some elements of the data of interest are themselves +non-numeric, or there are non-numeric portions of content +interspersed with the numeric data of interest. +``pent`` provides the "misc" token +(|cour|\ &\ |/cour|) to handle these kinds of situations. 
+ +Take the following data, which is an example of the +`XYZ format `__ +for representing the atomic +coordinates of a chemical system: + +.. doctest:: misc + + >>> text_xyz = dedent(""" + ... 5 + ... Coordinates from MeCl2F_2 + ... C -3.081564 2.283942 0.044943 + ... Cl -1.303141 2.255173 0.064645 + ... Cl -3.706406 3.411601 -1.180577 + ... F -3.541771 2.647036 1.270358 + ... H -3.439068 1.277858 -0.199370 + ... """) + +In this case, pretty much everything in the text block is of +interest. The first number indicates how many atoms are present +(useful for cross-checking the data import), the line of +text is an arbitrary string describing the chemical system, +and the data block provides the atomic symbol of each atom and +its xyz position in space. + +.. _tutorial-examples-misc-splittingofany: + +The following |Parser| will enable capture of the entire contents +of the string: + +.. doctest:: misc + + >>> prs_xyz = pent.Parser( + ... head=("#!..i", "~!"), + ... body="&!. #!+.d", + ... ) + +The atomic symbols and coordinates are most easily retrieved +with |capture_body|: + +.. doctest:: misc + + >>> data_atoms = prs_xyz.capture_body(text_xyz) + >>> data_atoms + [[['C', '-3.081564', '2.283942', '0.044943'], ['Cl', '-1.303141', '2.255173', '0.064645'], ['Cl', '-3.706406', '3.411601', '-1.180577'], ['F', '-3.541771', '2.647036', '1.270358'], ['H', '-3.439068', '1.277858', '-0.199370']]] + +The atom count and description can be retrieved with +|capture_struct|: + +.. doctest:: misc + + >>> data_struct = prs_xyz.capture_struct(text_xyz) + >>> data_struct[pent.ParserField.Head][0] + ['5', 'Coordinates', 'from', 'MeCl2F_2'] + +Unlike in *body*, where two-dimensional structure is inferred in captured data, +in *head* and *tail* all captures are returned as elements of a single, flat |list|. + +Currently, it is not possible to avoid the splitting of *all* captured content +at whitespace, even it it was captured from a single 'any' or 'literal' token. 
+:issue:`26` and/or :issue:`62` are planned and will provide mechanism(s) +to change this behavior. + +----- + +As an aside, in this particular case the 'misc' token was not strictly +necessary in the *body*, as the capturing 'any' token +(|cour|\ ~!\ |/cour|) would also have worked: + +.. doctest:: misc + + >>> prs_any = pent.Parser( + ... head=("#.+i", "~"), + ... body="~! #!+.d", + ... ) + >>> prs_any.capture_body(text_xyz) + [[['C', '-3.081564', '2.283942', '0.044943'], ['Cl', '-1.303141', '2.255173', '0.064645'], ['Cl', '-3.706406', '3.411601', '-1.180577'], ['F', '-3.541771', '2.647036', '1.270358'], ['H', '-3.439068', '1.277858', '-0.199370']]] + +However, there are situations where the ability +of the 'misc' token to match +only a single, arbitrary piece of whitespace-delimited +content is useful in order to narrow the specificity of +the |Parser| match. + + +Another example of the use of the 'misc' token is given +at :doc:`post_process`. + diff --git a/doc/source/tutorial/examples/nested_parsers.rst b/doc/source/tutorial/examples/nested_parsers.rst new file mode 100644 index 0000000..b4d7bed --- /dev/null +++ b/doc/source/tutorial/examples/nested_parsers.rst @@ -0,0 +1,170 @@ +.. Capturing with nested Parsers + +Capturing with Nested |Parser|\ s +================================= + +``pent`` is also able to parse and capture higher-dimensional data +stored as free text. Take the following data string: + +.. doctest:: orca_hess + + >>> text = dedent("""\ + ... $hessian + ... 4 + ... 0 1 + ... 0 0.473532 0.004379 + ... 1 0.004785 0.028807 + ... 2 0.004785 -0.022335 + ... 3 -0.418007 0.008333 + ... 2 3 + ... 0 0.004379 -0.416666 + ... 1 -0.022335 0.008067 + ... 2 0.028807 0.008067 + ... 3 0.008333 0.420926 + ... """) + +`text` represents a 4x4 matrix, with the first two columns printed in one section, +and the second two columns printed in a separate, following section. +Each row and column is marked with its respective index. 
+In order to import this data successfully, the *body* of the main +|Parser| will have to be set to a different, inner |Parser|. + + +Defining the Inner |Parser| +--------------------------- + +Each section of data columns starts with a row containing only positive integers, +which does not need to be captured. After that leading row are +multiple rows with data, each of which leads with a single +positive integer, followed by decimal-format data of any sign: + +.. doctest:: orca_hess + + >>> text_inner = dedent("""\ + ... 0 1 + ... 0 0.473532 0.004379 + ... 1 0.004785 0.028807 + ... 2 0.004785 -0.022335 + ... 3 -0.418007 0.008333 + ... """) + +One way to construct a |Parser| for this internal block is as follows: + +.. doctest:: orca_hess + + >>> prs_inner = pent.Parser( + ... head="#++i", + ... body="#.+i #!+.d", + ... ) + >>> prs_inner.capture_body(text_inner) + [[['0.473532', '0.004379'], ['0.004785', '0.028807'], ['0.004785', '-0.022335'], ['-0.418007', '0.008333']]] + +Note that even though the multiple decimal values in each row of the data block +were matched by the single "|cour|\ #!+.d\ |/cour|" token in *body*, +they were reported as separate values in the output. +As currently implemented, ``pent`` will **always** split captured content +at any internal whitespace; a further example of this with the 'any' token +can be seen :ref:`here `. + +Defining the Outer |Parser| +--------------------------- + +The outer |Parser| then makes use of the inner |Parser| as its *body*, +with the two header lines defined in *head*: + +.. doctest:: orca_hess + + >>> prs_outer = pent.Parser( + ... head=("@.$hessian", "#.+i"), + ... body=prs_inner, + ... 
) + >>> data = prs_outer.capture_body(text) + >>> data + [[[['0.473532', '0.004379'], ['0.004785', '0.028807'], ['0.004785', '-0.022335'], ['-0.418007', '0.008333']], [['0.004379', '-0.416666'], ['-0.022335', '0.008067'], ['0.028807', '0.008067'], ['0.008333', '0.420926']]]] + + +Structure of the Returned *data* +-------------------------------- + +The structure of the list returned by |capture_body| nests four levels deep: + +.. doctest:: orca_hess + + >>> arr = np.asarray(data, dtype=float) + >>> arr.shape + (1, 2, 4, 2) + + +This is because: + +1. Each block of data is returned as a matrix (adds two levels); + +2. The *body* of *prs_outer* is a |Parser| (adds one level); and + +3. The |capture_body| method wraps everything in a list (adds one level). + +So, working from left to right, the |cour|\ (1, 2, 4, 2)\ |/cour| +shape of the data arises because: + +1. The overall *prs_outer* matched **1 time**; + +2. The inner *prs_inner*, as the *body* of *prs_outer*, matched **2 times**; and + +3. Both blocks of data matched by *prs_inner* have **4 rows** and **2 columns** + + +Reassembling the Full 4x4 Matrix +-------------------------------- + +In cases like this, ``numpy``'s :func:`~numpy.column_stack` provides +a simple way to reassemble the full 4x4 matrix of data, though +it is necessary to convert each matrix to an |ndarray| separately: + +.. doctest:: orca_hess + + >>> np.column_stack([np.asarray(block, dtype=float) for block in data[0]]) + array([[ 0.473532, 0.004379, 0.004379, -0.416666], + [ 0.004785, 0.028807, -0.022335, 0.008067], + [ 0.004785, -0.022335, 0.028807, 0.008067], + [-0.418007, 0.008333, 0.008333, 0.420926]]) + +`data[0]` is used instead of `data` in the generator expression +so that the two inner 4x2 blocks of data are yielded separately to :func:`~numpy.asarray`. 
+ +Coping with Mismatched Data Block Sizes +--------------------------------------- + +Nothing guarantees that the data in a chunk of text will have properly matched +internal dimensions, however. ``pent`` will still import the data, but +it may not be possible to pull it directly into a ``numpy`` array +as was done above: + +.. doctest:: orca_hess + + >>> text2 = dedent("""\ + ... $hessian + ... 4 + ... 0 1 + ... 0 0.473532 0.004379 + ... 1 0.004785 0.028807 + ... 2 0.004785 -0.022335 + ... 3 -0.418007 0.008333 + ... 2 3 + ... 0 0.004379 -0.416666 + ... 1 -0.022335 0.008067 + ... """) + >>> data2 = prs_outer.capture_body(text2) + >>> data2 + [[[['0.473532', '0.004379'], ['0.004785', '0.028807'], ['0.004785', '-0.022335'], ['-0.418007', '0.008333']], [['0.004379', '-0.416666'], ['-0.022335', '0.008067']]]] + >>> np.asarray(data2, dtype=float) + Traceback (most recent call last): + ... + ValueError: setting an array element with a sequence. + >>> np.column_stack([np.asarray(block, dtype=float) for block in data2[0]]) + Traceback (most recent call last): + ... + ValueError: all the input array dimensions except for the concatenation axis must match exactly + +In situations like this, the returned data structure either must be processed +with methods that can accommodate the missing data, or the missing data must be explicitly +filled in before conversion to |ndarray|. diff --git a/doc/source/tutorial/examples/optional_line.rst b/doc/source/tutorial/examples/optional_line.rst new file mode 100644 index 0000000..f29e80a --- /dev/null +++ b/doc/source/tutorial/examples/optional_line.rst @@ -0,0 +1,139 @@ +.. Demonstration of the optional-line token + +The Optional-Line Token +======================= + +In some situations, data is output in a fashion such +that a line of, e.g., header text is present in +some parts of the content of interest, but not others. +Take the following fictitious example: + +.. doctest:: main + + >>> text = dedent(""" + ... $DATA + ... 
ITERATION 1 + ... 0 1 2 + ... 1.5 3.1 2.4 + ... 3 4 5 + ... -0.1 2.7 -9.3 + ... ITERATION 2 + ... 0 1 2 + ... 1.6 2.9 1.8 + ... 3 4 5 + ... -0.4 2.1 -8.7 + ... """) + +This data block could be matched with triply nested |Parsers|: + +.. doctest:: main + + >>> prs_3x = pent.Parser( + ... head="@.$DATA", + ... body=pent.Parser( + ... head="@.ITERATION #..i", + ... body=pent.Parser( + ... head="#++i", + ... body="#!+.d", + ... ), + ... ), + ... ) + >>> prs_3x.capture_body(text) + [[[[['1.5', '3.1', '2.4']], [['-0.1', '2.7', '-9.3']]], [[['1.6', '2.9', '1.8']], [['-0.4', '2.1', '-8.7']]]]] + +However, that definition is quite bulky, and for more complex +patterns and larger text inputs the three layers of nesting +can sometimes lead to problematically slow parsing times. + +The :ref:`optional-line ` +pattern flag allows for a simpler |Parser| structure here: + +.. doctest:: main + + >>> prs_opt = pent.Parser( + ... head=("? @.$DATA", "@.ITERATION #..i"), + ... body=pent.Parser( + ... head="#++i", + ... body="#!+.d", + ... ), + ... ) + >>> prs_opt.capture_body(text) + [[[['1.5', '3.1', '2.4']], [['-0.1', '2.7', '-9.3']]], [[['1.6', '2.9', '1.8']], [['-0.4', '2.1', '-8.7']]]] + +The |cour|\ $DATA\ |/cour| is now wrapped into the *head* +of the outer of just *two* |Parsers|, flagged as optional so that +the |cour|\ ITERATION 2\ |/cour| can be matched. +This approach also returns the data with one fewer level of +|list| enclosure, which may be convenient in +downstream processing. + +Since in this example the lines containing integers and the +lines containing decimals are *strictly* alternating, +yet another alternative would be to include the integer 'header' +lines as a non-captured portion of the *body*: + +.. doctest:: main + + >>> prs_opt = pent.Parser( + ... head=("? @.$DATA", "@.ITERATION #..i"), + ... body=pent.Parser( + ... body=("#++i", "#!+.d"), + ... ) + ... 
) + >>> prs_opt.capture_body(text) + [[[['1.5', '3.1', '2.4'], ['-0.1', '2.7', '-9.3']]], [[['1.6', '2.9', '1.8'], ['-0.4', '2.1', '-8.7']]]] + +Doing it this way results in each |cour|\ ITERATION\ |/cour|\ 's +data being grouped into a two-dimensional matrix, instead of +each individual line of decimal values occurring in its own +matrix. This may or may not be desirable, depending on the +semantics of the data being captured. + + +.. _tutorial-examples-optline-threetypes: + +The Three Cases of Optional-Line Matches +---------------------------------------- + +More generally, as noted at the +:ref:`'pattern' basic usage page `, +a pattern with the optional flag will match in three situations: + +1. When a line is present matching the optional pattern: + + .. doctest:: match_types + + >>> prs = pent.Parser(body=("@!.a", "? @!.b", "@!.c")) + >>> prs.capture_body("""a + ... b + ... c""") + [[['a', 'b', 'c']]] + +2. When a blank line is present where the optional pattern would match: + + .. doctest:: match_types + + >>> prs.capture_body("""a + ... + ... c""") + [[['a', None, 'c']]] + +3. When there is **no** line present where the optional pattern would match: + + .. doctest:: match_types + + >>> prs.capture_body("""a + ... c""") + [[['a', None, 'c']]] + +If a line is present that does not match the optional pattern, +the **entire** |Parser| will fail to match: + +.. doctest:: match_types + + >>> prs.capture_body("""a + ... foo + ... c""") + [] + + diff --git a/doc/source/tutorial/examples/post_process.rst b/doc/source/tutorial/examples/post_process.rst new file mode 100644 index 0000000..dd01689 --- /dev/null +++ b/doc/source/tutorial/examples/post_process.rst @@ -0,0 +1,13 @@ +.. Post-processing example + +\*Post-Processing of Captured Data +================================== + +Sometimes, data in text is laid out in a way such that +``pent`` can't retrieve *only* the data of interest +using a |Parser|. 
In these cases, post-processing +of the data obtained from |capture_body| is the +simplest approach. + +*Multiwfn LI* + diff --git a/doc/source/tutorial/examples/single_parser.rst b/doc/source/tutorial/examples/single_parser.rst new file mode 100644 index 0000000..d4fa346 --- /dev/null +++ b/doc/source/tutorial/examples/single_parser.rst @@ -0,0 +1,222 @@ +.. Capturing with a single Parser + +Capturing with a Single |Parser| +================================ + +This first example is a modified version of the dataset used in the first half of the +project `README `__, +drawn from a .hess file generated by +`ORCA `__: + +.. doctest:: orca_freqs + + >>> text = dedent("""\ + ... $vibrational_frequencies + ... 6 + ... 0 0.000000 + ... 1 0.000000 + ... 2 -194.490162 + ... 3 -198.587114 + ... 4 389.931897 + ... 5 402.713910 + ... """) + + +A Minimal |Parser| Body +----------------------- + +Focusing first on the main section of the data, the goal here is to retrieve +the floats in the right-hand column; the rest of the content is irrelevant. +However, the integers in the left-hand column still have to be represented in the +pattern, even if they're not captured. + +So, to represent those leading integers, the first token of the body pattern +needs to be a single number (|cour|\ #.\ |/cour|) that's not captured +(omit |cour|\ !\ |/cour|), with a positive sign +(|cour|\ +\ |/cour|) and integer format (|cour|\ i\ |/cour|), leading to +|cour|\ #.+i\ |/cour|. + +Then, to match the second, decimal value on each line, the second token +needs to also be a single number +(|cour|\ #.\ |/cour|) of decimal format (|cour|\ d\ |/cour|). +But, since we want these values to be captured in output, it's necessary to +insert |cour|\ !\ |/cour| after |cour|\ #\ |/cour|. And, since some of the values +in this list are negative and some are positive, the token should allow any +sign (|cour|\ .\ |/cour|). +Thus, the second token should be |cour|\ #!..d\ |/cour|. 
 + +So, a first stab at the body of the |Parser| would be: + +.. doctest:: orca_freqs + + >>> prs = pent.Parser(body="#.+i #!..d") + >>> prs.capture_body(text) + [[['0.000000'], ['0.000000'], ['-194.490162'], ['-198.587114'], ['389.931897'], ['402.713910']]] + +Works nicely! There are two things to note about the data returned here, though: + +**First**, all of the numerical values are returned as **strings**. ``pent`` tries to +maximize flexibility by making no assumptions about what needs to be +done with the data. Thus, some post-processing will always be required. +For example, to get the captured values from `data` into a ``numpy`` array, +one could do the following: + +.. doctest:: orca_freqs + + >>> arr = np.asarray(prs.capture_body(text), dtype=float).squeeze() + >>> print(arr) + [ 0. 0. -194.490162 -198.587114 389.931897 402.71391 ] + +**Second**, the captured data is always returned as a nested series of lists. +In situations like this one, where a single |Parser| is used, +the nesting will be three levels deep. This is because each matching block +of data is returned as a matrix (a list of lists), and each of these matrices +is then in turn a member of the outermost list. + +In this particular instance, since the `body` captures exactly one value +per line of text parsed, the innermost lists are length-one. And, since +there are six lines that match the `body` pattern, the matrix +that is returned is of size 6x1 (a list containing six length-one lists). + +This means that if there had been a gap in the data, +the outermost list would have had length greater than one: + +.. doctest:: orca_freqs + + >>> text2 = dedent("""\ + ... 0 0.000000 + ... 1 0.000000 + ... + ... 2 -194.490162 + ... 3 -198.587114 + ... 
""") + >>> prs.capture_body(text2) + [[['0.000000'], ['0.000000']], [['-194.490162'], ['-198.587114']]] + +There are two blocks of data here, each with two rows of one value each, so +the return value from |capture_body| is a length-two list, +where each item of that list represents a 2x1 matrix. + + +.. _tutorial-examples-singleparser-multiplevalues: + +Capturing Multiple Values per Line +---------------------------------- + +If one wanted to also capture the integer indices in each row, the only +change needed would be to add the |cour|\ !\ |/cour| capturing flag to that first token: + +.. doctest:: orca_freqs + + >>> pent.Parser(body="#!.+i #!..d").capture_body(text2) + [[['0', '0.000000'], ['1', '0.000000']], [['2', '-194.490162'], ['3', '-198.587114']]] + + +Constraining the |Parser| Match with a `head` +--------------------------------------------- + +**However,** what if there are other datasets in the file that have +this same format, but that we don't want to capture: + +.. doctest:: orca_freqs + + >>> text3 = dedent("""\ + ... $vibrational_frequencies + ... 6 + ... 0 0.000000 + ... 1 0.000000 + ... 2 -194.490162 + ... 3 -198.587114 + ... 4 389.931897 + ... 5 402.713910 + ... + ... $unrelated_data + ... 3 + ... 0 3.316 + ... 1 -4.311 + ... 2 12.120 + ... """) + +The original |Parser| will grab both of these blocks of data: + +.. doctest:: orca_freqs + + >>> prs.capture_body(text3) + [[['0.000000'], ['0.000000'], ['-194.490162'], ['-198.587114'], ['389.931897'], ['402.713910']], [['3.316'], ['-4.311'], ['12.120']]] + +The |Parser| can be constrained to only the data we want by introducing a `head` +pattern: + +.. doctest:: orca_freqs + + >>> prs2 = pent.Parser( + ... head=["@.$vibrational_frequencies", "#!.+i"], + ... body="#.+i #!..d" + ... 
) + >>> prs2.capture_body(text3) + [[['0.000000'], ['0.000000'], ['-194.490162'], ['-198.587114'], ['389.931897'], ['402.713910']]] + +This use of `head` introduces two concepts: (1) the 'literal string' token, |cour|\ @\ |/cour|, +in combination with the "\ |cour|\ .\ |/cour|\ " quantity marker telling the +|Parser| to match the literal string exactly once; and (2) the ``pent`` +feature wherein a length-\ *n* ordered iterable of pattern strings +(here, length-two) will match *n* lines from the data string. In this case, +the first string in the tuple matches the +"\ |cour|\ $vibrational_frequencies\ |/cour|\ " marker in the first line of the header, +and the second captures the single positive integer in the second line of the header. + + +Capturing in *head* and *tail* with |capture_struct| +----------------------------------------------------------------------- + +In the example immediately above, note that even though the "\ |cour|\ !\ |/cour|\ " +capturing flag is specified in the second element of the `head`, +that captured value does not show up in the +|capture_body| output. Captures in `head` and `tail` must +be retrieved using |capture_struct|: + +.. doctest:: orca_freqs + + >>> prs2.capture_struct(text3) + [{<ParserField.Head: 'head'>: [['6']], <ParserField.Body: 'body'>: [['0.000000'], ['0.000000'], ['-194.490162'], ['-198.587114'], ['389.931897'], ['402.713910']], <ParserField.Tail: 'tail'>: None}] + >>> prs2.capture_struct(text3)[0][pent.ParserField.Head] + [['6']] + +The return value from |capture_struct| +has length equal to the number of times the |Parser| matched +within the text. Here, since the pattern only matched once, the return +value is of length one. + +As a convenience, the lists returned by |capture_struct| +are actually of type |ThruList|, a custom subclass of |list|, +which will silently pass through indices/keys to their first element +if and only if they are of length one. +Thus, the following would also work for `prs2` operating on `text3`: + +..
doctest:: orca_freqs + + >>> prs2.capture_struct(text3)[pent.ParserField.Head] + [['6']] + +But, it would break for the original `prs`, +where the overall pattern matched twice: + +.. doctest:: orca_freqs + + >>> prs.capture_struct(text3) + [{<ParserField.Head: 'head'>: None, <ParserField.Body: 'body'>: [['0.000000'], ['0.000000'], ['-194.490162'], ['-198.587114'], ['389.931897'], ['402.713910']], <ParserField.Tail: 'tail'>: None}, {<ParserField.Head: 'head'>: None, <ParserField.Body: 'body'>: [['3.316'], ['-4.311'], ['12.120']], <ParserField.Tail: 'tail'>: None}] + >>> prs.capture_struct(text3)[pent.ParserField.Head] + Traceback (most recent call last): + ... + pent.errors.ThruListError: Invalid ThruList index: Numeric index required for len != 1 + +As a final note, consider the difference between the `head` and `tail` results +for the below |Parser|, where `head` is defined but has no capturing tokens present +(yields ``[[]]``), but `tail` is not specified (yields ``None``): + +.. doctest:: orca_freqs + + >>> pent.Parser(head="#.+i", body="#.+i #!..d").capture_struct(text) + [{<ParserField.Head: 'head'>: [[]], <ParserField.Body: 'body'>: [['0.000000'], ['0.000000'], ['-194.490162'], ['-198.587114'], ['389.931897'], ['402.713910']], <ParserField.Tail: 'tail'>: None}] + + diff --git a/doc/source/tutorial/examples/space_after.rst b/doc/source/tutorial/examples/space_after.rst new file mode 100644 index 0000000..26136c7 --- /dev/null +++ b/doc/source/tutorial/examples/space_after.rst @@ -0,0 +1,108 @@ +.. Demonstrating use-cases for required/optional/no space after + +Required/Optional/Prohibited Trailing Whitespace +================================================ + +By default, :ref:`number ` (|cour|\ #\ |/cour|), +:ref:`misc ` (|cour|\ &\ |/cour|), +and :ref:`literal ` (|cour|\ @\ |/cour|) tokens +require trailing whitespace to be present in the text +in order to match: + +.. doctest:: space_after + + >>> text_space = dedent("""\ + ... foo: 5 + ... bar: 8 + ... """) + >>> text_nospace = dedent("""\ + ... foo:5 + ... bar:8 + ... """) + >>> prs_req = pent.Parser(body="&.
#!.+i") + >>> prs_req.capture_body(text_space) + [[['5'], ['8']]] + >>> prs_req.capture_body(text_nospace) + [] + +``pent`` provides a means to make this trailing whitespace +either optional or prohibited, if needed, +via a :ref:`token-level flag `. + +Optional trailing whitespace is indicated with an +"|cour|\ o\ |/cour|" flag in the token: + +.. doctest:: space_after + + >>> prs_opt = pent.Parser(body="&o. #!.+i") + >>> prs_opt.capture_body(text_space) + [[['5'], ['8']]] + >>> prs_opt.capture_body(text_nospace) + [[['5'], ['8']]] + +Similarly, prohibited trailing whitespace is indicated with an +"|cour|\ x\ |/cour|" flag in the token: + +.. doctest:: space_after + + >>> prs_prohib = pent.Parser(body="&x. #!.+i") + >>> prs_prohib.capture_body(text_space) + [] + >>> prs_prohib.capture_body(text_nospace) + [[['5'], ['8']]] + +If used in combination with the capturing "|cour|\ !\ |/cour|" flag, +the trailing-space flag is placed *before* the capturing flag; +e.g., as "|cour|\ &x!.\ |/cour|". + +One common situation where this capability is needed +is when a number of interest is contained in prose text +and falls at the end of a sentence: + +.. doctest:: space_after + + >>> text_prose = dedent("""\ + ... pi is approximately 3.14159. + ... """) + >>> pent.Parser(body="~ #!..d &.").capture_body(text_prose) + [] + >>> pent.Parser(body="~ #x!..d &.").capture_body(text_prose) + [[['3.14159']]] + +Don't forget to include a token for that trailing period! +The |Parser| won't find a match, otherwise: + +.. doctest:: space_after + + >>> pent.Parser(body="~ #x!..d").capture_body(text_prose) + [] + + +Limitations of the "Any" Token +------------------------------ + +Note that, as currently implemented, the 'any' token +(|cour|\ ~\ |/cour|) does not allow specification of +optional or prohibited trailing whitespace; any +content that it matches *must* be followed by +whitespace for the |Parser| to work: + +.. doctest:: space_after + + >>> text_sandwich = dedent("""\ + ... 
This number3.14159is sandwiched in text. + ... """) + >>> pent.Parser(body="~ #x!..d ~").capture_body(text_sandwich) + [] + +In order to match this value, the preceding text must be matched +either by a literal or a misc token: + +.. doctest:: space_after + + >>> pent.Parser(body="~ @x.number #x!..d ~").capture_body(text_sandwich) + [[['3.14159']]] + >>> pent.Parser(body="~ &x. #x!..d ~").capture_body(text_sandwich) + [[['3.14159']]] + +This deficiency will be addressed in :issue:`78`. diff --git a/pent/__init__.py b/pent/__init__.py index 57b4267..125e62b 100644 --- a/pent/__init__.py +++ b/pent/__init__.py @@ -9,7 +9,7 @@ 3 Sep 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -60,4 +60,4 @@ from .utils import column_stack_2d -__version__ = "0.2rc1" +__version__ = "0.2" diff --git a/pent/enums.py b/pent/enums.py index f270792..fc30591 100644 --- a/pent/enums.py +++ b/pent/enums.py @@ -9,7 +9,7 @@ 3 Sep 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -33,14 +33,14 @@ class Number(str, Enum): #: Integer value; no decimal or scientific/exponential notation Integer = "i" - #: Floating-point value; no scientific/exponential notation - Float = "f" + #: Decimal floating-point value; no scientific/exponential notation + Decimal = "d" #: Scientific/exponential notation, where exponent is *required* SciNot = "s" - #: "Decimal" value; floating-point value with or without an exponent - Decimal = "d" + #: "Floating-point value with or without an exponent + Float = "f" #: "General" value; integer, float, or scientific notation General = "g" @@ -147,7 +147,3 @@ class SpaceAfter(str, Enum): #: Following space prohibited Prohibited = "x" - - -if __name__ == "__main__": # pragma: no cover - print("Module not executable.") diff --git a/pent/errors.py b/pent/errors.py index 8f607bc..17281a3 100644 --- a/pent/errors.py +++ 
b/pent/errors.py @@ -9,7 +9,7 @@ 10 Sep 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -26,7 +26,7 @@ class PentError(Exception): # pragma: no cover - """Superclass for all custom |pent| errors.""" + """Superclass for all custom pent errors.""" pass @@ -77,7 +77,3 @@ def __init__(self, msg=""): def __str__(self): """Generate a more-informative error message.""" return "Invalid ThruList index: {}".format(self.msg) - - -if __name__ == "__main__": # pragma: no cover - print("Module not executable.") diff --git a/pent/parser.py b/pent/parser.py index d1aa0e0..e7f2339 100644 --- a/pent/parser.py +++ b/pent/parser.py @@ -9,7 +9,7 @@ 8 Sep 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -76,7 +76,7 @@ def all_optional(sec): if res_head: rx += ( ( - "(?P<{}>".format(ParserField.Head) + res_head + r")\n?" + "(?P<{}>".format(ParserField.Head) + res_head + ")" if capture_sections else "(" + res_head + ")" ) @@ -158,13 +158,24 @@ def capture_str_pattern(cls, pat_str, text): # This is unreachable at the moment raise SectionError("Invalid pattern string for capture") - if text is None: - # An all-optional section was entirely absent - return [[None]] + if text is None: # pragma: no cover + # The changes implemented for optional-line handling (#89) + # appear to have made this irrelevant. However, + # it was included at one time, so keep the trap + raise RuntimeError("'text' was unexpectedly None") data = [] for m in re.finditer(pat_re, text): chunk_caps = [] + + # Do not want to capture anything from a zero-length + # match; this leads to spurious [None] returns when + # the optional-line flag is used, as the entirely-optional + # nature of that regex will match a zero-length segment + # at (if nothing else) the end of the matched portion. 
+ if len(m.group(0)) == 0: + continue + for c in cls.generate_captures(m): if c is None: chunk_caps.append(None) @@ -312,12 +323,20 @@ def convert_line(cls, line, *, capture_groups=True, group_id=0): # Always put possible whitespace to the end of the line. # Also include a format tag for closing optional-line grouping - pattern += r"[ \t]*{opline_close}($|(?=\n))" + pattern += r"[ \t]*{opline_close}" + + # Per #89, this lookahead must also be optional for an + # optional line + pattern += "($|(?=\n))" + ("?" if optional_line else "") # Wrap pattern with parens and '?' if it's optional - # Otherwise just drop the formatting tags + # Otherwise just drop the formatting tags. + # + # The leading question mark in the opline_open + # substitution is to make the SOL/SOF lookbehind + # optional in the case of an optional line, per #89. pattern = pattern.format( - opline_open=("(" if optional_line else ""), + opline_open=("?(" if optional_line else ""), opline_close=(")?" if optional_line else ""), ) @@ -336,7 +355,3 @@ def __attrs_post_init__(self): """Perform instantiation-time stuff.""" # Check pattern viability *now* self.pattern(capture_sections=False) - - -if __name__ == "__main__": # pragma: no cover - print("Module not executable.") diff --git a/pent/patterns.py b/pent/patterns.py index 03f774c..c8fe931 100644 --- a/pent/patterns.py +++ b/pent/patterns.py @@ -9,7 +9,7 @@ 2 Sep 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -76,23 +76,25 @@ def std_wordify_close(p): _p_intnums = r"\d+" -_p_floatnums = r"(\d+\.\d*|\d*\.\d+)" +_p_decimalnums = r"(\d+\.\d*|\d*\.\d+)" _p_scinums = r"(\d+\.?\d*[{0}][+-]?\d+|\d*\.\d+[{0}][+-]?\d+)".format( std_scinot_markers ) -_p_decimalnums = r"({0}|{1})".format(_p_floatnums, _p_scinums) +_p_floatnums = r"({0}|{1})".format(_p_decimalnums, _p_scinums) -_p_generalnums = r"({0}|{1}|{2})".format(_p_floatnums, _p_scinums, _p_intnums) +_p_generalnums = 
r"({0}|{1}|{2})".format( + _p_decimalnums, _p_scinums, _p_intnums +) _p_nums = { Number.Integer: _p_intnums, - Number.Float: _p_floatnums, - Number.SciNot: _p_scinums, Number.Decimal: _p_decimalnums, + Number.SciNot: _p_scinums, + Number.Float: _p_floatnums, Number.General: _p_generalnums, } @@ -103,7 +105,3 @@ def std_wordify_close(p): for (n, s) in itt.product(Number, Sign): number_patterns.update({(n, s): _p_signs[s] + _p_nums[n]}) - - -if __name__ == "__main__": # pragma: no cover - print("Module not executable.") diff --git a/pent/test/__init__.py b/pent/test/__init__.py index 6f82510..42dec27 100644 --- a/pent/test/__init__.py +++ b/pent/test/__init__.py @@ -9,7 +9,7 @@ 2 Sep 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent diff --git a/pent/test/pent_base.py b/pent/test/pent_base.py index 693e873..f780374 100644 --- a/pent/test/pent_base.py +++ b/pent/test/pent_base.py @@ -9,7 +9,7 @@ 3 Sep 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -34,6 +34,7 @@ from textwrap import dedent import unittest as ut +import pent from pent import ParserField from pent.errors import LineError from pent.thrulist import ThruList @@ -951,6 +952,266 @@ def test_parsers_in_head_and_tail(self): self.assertEqual(result, expect) +class TestPentParserOptlinePatterns(ut.TestCase, SuperPent): + """Confirming matching of optional-line patterns.""" + + def test_optional_1line_head(self): + """Confirm parsers with one optional line in head.""" + prs = pent.Parser(head="? @!.foo", body="#!+.i") + + # Many of these match because the "header" line is just + # ignored by the optional line of the head. + # NEED TO ROBUSTIFY WITH A CHECK ON THE HEAD CAPTURES! 
+ good_texts = [ + ("1 2 3", []), + ("1 2 3\n4 5 6", []), + ("foo\n1 2 3", [["foo"]]), + ("\n1 2 3", []), + ("foo\n\n\n1 2 3", []), + ("foobar\n1 2 3", []), + ] + for i, tup in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(tup[0]) + self.assertNotEqual([], s) + self.assertEqual(s[pent.ParserField.Head], tup[1]) + + def test_optional_2line_head(self): + """Confirm parsers with two optional lines in head.""" + prs = pent.Parser(head=("? @!.foo", "? @!.bar"), body="#!+.i") + + # Again, a couple of these match because the whole header + # is just ignored. + good_texts = [ + ("1 2 3", []), + ("1 2 3\n4 5", []), + ("foo\n1 2 3", [["foo", None]]), + ("bar\n1 2 3", [[None, "bar"]]), + ("foobar\n1 2 3", [[None, None]]), + ("foo\n\n\n1 2 3", [[None, None]]), + ("foo\nbar\n1 2 3", [["foo", "bar"]]), + ("foo\n\n1 2 3", [["foo", None]]), + ] + + for i, tup in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(tup[0]) + self.assertNotEqual([], s) + self.assertEqual(s[pent.ParserField.Head], tup[1]) + + def test_optional_lastline_head(self): + """Confirm parsers with one required & one optional line in head.""" + prs = pent.Parser(head=("@!.foo", "? @!.bar"), body="#!+.i") + + good_texts = [ + ("foo\n1 2 3", [["foo", None]]), + ("foo\n\n1 2 3", [["foo", None]]), + ("foo\nbar\n1 2 3", [["foo", "bar"]]), + ] + + for i, tup in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(tup[0]) + self.assertNotEqual([], s) + self.assertEqual(s[pent.ParserField.Head], tup[1]) + + bad_texts = ["foo\n\n\n1 2 3", "foo\nbaz\n1 2 3", "foo\nbar\n\n1 2 3"] + + for i, text in enumerate(bad_texts): + with self.subTest("bad_{}".format(i)): + s = prs.capture_struct(text) + self.assertEqual([], s) + + def test_optional_firstline_head(self): + """Confirm parsers with one optional & one required line in head.""" + prs = pent.Parser(head=("? 
@!.foo", "@!.bar"), body="#!+.i") + + good_texts = [ + ("bar\n1 2 3", [[None, "bar"]]), + ("foo\nbar\n1 2 3", [["foo", "bar"]]), + ("quuz\nbar\n1 2 3", [[None, "bar"]]), + ] + + for i, tup in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(tup[0]) + self.assertNotEqual([], s) + self.assertEqual(s[pent.ParserField.Head], tup[1]) + + bad_texts = [ + "foo\n1 2 3", + "foo\nbaz\n1 2 3", + "bar\n\n1 2 3", + "foo\nbar\n\n1 2 3", + ] + + for i, text in enumerate(bad_texts): + with self.subTest("bad_{}".format(i)): + s = prs.capture_struct(text) + self.assertEqual([], s) + + def test_optional_body_line(self): + """Confirm optional line works inside body.""" + prs = pent.Parser( + head="@.foo", body=("#!+.i", "? #!+.d"), tail="@.bar" + ) + + good_texts = [ + dedent( + """ + foo + 1 2 3 + 1. 2. 3. + 4 5 6 + 4. 5. 6. + bar + """ + ), + dedent( + """ + foo + 1 2 3 + 4 5 6 + 7 8 9 + 7. 8. 9. + 1 2 3 + bar + """ + ), + ] + + for i, text in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(text) + self.assertNotEqual([], s) + + def test_optional_1line_tail(self): + """Confirm parsers with one optional line in tail.""" + prs = pent.Parser(tail="? @!.foo", body="#!+.i") + + # Many of these match because the "footer" line is just + # ignored by the optional line of the tail. + good_texts = [ + ("1 2 3", []), + ("1 2 3\n4 5 6", []), + ("1 2 3\nfoo", [["foo"]]), + ("\n1 2 3\n", []), + ("1 2 3\n\nfoo", []), + ("1 2 3\nfoobar", []), + ] + for i, tup in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(tup[0]) + self.assertNotEqual([], s) + self.assertEqual(s[pent.ParserField.Tail], tup[1]) + + def test_optional_2line_tail(self): + """Confirm parsers with two optional lines in tail.""" + prs = pent.Parser(tail=("? @!.foo", "? @!.bar"), body="#!+.i") + + # Again, a couple of these match because the whole footer + # is just ignored. 
+ good_texts = [ + ("1 2 3", []), + ("1 2 3\n4 5", []), + ("1 2 3\nfoo", [["foo", None]]), + ("1 2 3\nbar", [[None, "bar"]]), + ("1 2 3\nfoobar\n", []), + ("1 2 3\n\nfoo", [[None, None]]), + ("1 2 3\nfoo\nbar", [["foo", "bar"]]), + ("1 2 3\nfoo\n\n", [["foo", None]]), + ] + + for i, tup in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(tup[0]) + self.assertNotEqual([], s) + self.assertEqual(s[pent.ParserField.Tail], tup[1]) + + def test_optional_firstline_tail(self): + """Confirm parsers with one optional & one required line in tail.""" + prs = pent.Parser(tail=("? @!.foo", "@!.bar"), body="#!+.i") + + good_texts = [ + ("1 2 3\nbar", [[None, "bar"]]), + ("1 2 3\n\nbar", [[None, "bar"]]), + ("1 2 3\nfoo\nbar", [["foo", "bar"]]), + ] + + for i, tup in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(tup[0]) + self.assertNotEqual([], s) + self.assertEqual(s[pent.ParserField.Tail], tup[1]) + + bad_texts = ["1 2 3\n\n\nbar", "1 2 3\nfar\nbar", "1 2 3\n\nfoo\nbar"] + + for i, text in enumerate(bad_texts): + with self.subTest("bad_{}".format(i)): + s = prs.capture_struct(text) + self.assertEqual([], s) + + def test_optional_lastline_tail(self): + """Confirm parsers with one required & one optional line in tail.""" + prs = pent.Parser(tail=("@!.foo", "? 
@!.bar"), body="#!+.i") + + good_texts = [ + ("1 2 3\nfoo", [["foo", None]]), + ("1 2 3\nfoo\nbar", [["foo", "bar"]]), + ("1 2 3\nfoo\nquux", [["foo", None]]), + ] + + for i, tup in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(tup[0]) + self.assertNotEqual([], s) + self.assertEqual(s[pent.ParserField.Tail], tup[1]) + + bad_texts = [ + "1 2 3\nbar", + "1 2 3\nfar\nbar", + "1 2 3\nbar", + "1 2 3\n\nfoo\nbar", + ] + + for i, text in enumerate(bad_texts): + with self.subTest("bad_{}".format(i)): + s = prs.capture_struct(text) + self.assertEqual([], s) + + def test_absent_completely_optional_parser(self): + """Confirm match when an all-optional Parser section is absent.""" + prs = pent.Parser( + head=pent.Parser(body="? &!. &!."), + body=pent.Parser(head="&!.", body="#!..i #!..i #!..i"), + ) + + good_texts = [ + dedent( + """ + a b + c + 1 2 3 + d + 4 5 6 + """ + ), + dedent( + """ + c + 1 2 3 + d + 4 5 6 + """ + ), + ] + + for i, text in enumerate(good_texts): + with self.subTest("good_{}".format(i)): + s = prs.capture_struct(text) + self.assertNotEqual([], s) + + def suite_base(): """Create and return the test suite for base tests.""" s = ut.TestSuite() @@ -959,12 +1220,9 @@ def suite_base(): [ tl.loadTestsFromTestCase(TestPentCorePatterns), tl.loadTestsFromTestCase(TestPentParserPatterns), + tl.loadTestsFromTestCase(TestPentParserOptlinePatterns), tl.loadTestsFromTestCase(TestPentTokens), tl.loadTestsFromTestCase(TestPentThruList), ] ) return s - - -if __name__ == "__main__": - print("Module not executable.") diff --git a/pent/test/pent_livedata.py b/pent/test/pent_livedata.py index 5f39799..830c1c5 100644 --- a/pent/test/pent_livedata.py +++ b/pent/test/pent_livedata.py @@ -9,7 +9,7 @@ 9 Oct 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -284,14 +284,13 @@ def test_orca_opt_trajectory(self): self.assertEqual(res, orca_opt_trajectory) - 
@ut.expectedFailure def test_ORCA_opt_progress_results_optline(self): """Confirm parse of optimization results block using optline.""" import pent data = self.get_orca_opt_file() - from .testdata import orca_opt_status + from .testdata import orca_opt_status_optline prs = pent.Parser( body=( @@ -310,7 +309,7 @@ def test_ORCA_opt_progress_results_optline(self): ) ) - self.assertEqual(prs.capture_body(data), orca_opt_status) + self.assertEqual(prs.capture_body(data), orca_opt_status_optline) class TestPentMultiwfnLiveData(ut.TestCase, SuperPent): @@ -540,7 +539,3 @@ def suite_live_gamess(): tl = ut.TestLoader() s.addTests([tl.loadTestsFromTestCase(TestPentGAMESSLiveData)]) return s - - -if __name__ == "__main__": - print("Module not executable.") diff --git a/pent/test/pent_readme.py b/pent/test/pent_readme.py index 34c4570..c53b8a0 100644 --- a/pent/test/pent_readme.py +++ b/pent/test/pent_readme.py @@ -9,7 +9,7 @@ 25 Oct 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -56,7 +56,3 @@ def suite_doctest_readme(): s.addTests([TestPentReadme]) return s - - -if __name__ == "__main__": - print("Module not executable.") diff --git a/pent/test/pent_slow.py b/pent/test/pent_slow.py index fa9592d..45ae5f6 100644 --- a/pent/test/pent_slow.py +++ b/pent/test/pent_slow.py @@ -9,7 +9,7 @@ 9 Oct 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -120,7 +120,3 @@ def suite_base_slow(): tl = ut.TestLoader() s.addTests([tl.loadTestsFromTestCase(TestPentParserPatternsSlow)]) return s - - -if __name__ == "__main__": - print("Module not executable.") diff --git a/pent/test/testdata.py b/pent/test/testdata.py index d91b466..d29c87f 100644 --- a/pent/test/testdata.py +++ b/pent/test/testdata.py @@ -9,7 +9,7 @@ 3 Sep 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** 
http://www.github.com/bskinn/pent @@ -148,12 +148,12 @@ opt_1line_tail_expect_struct = [ - [[["FOOT"]], [[None]]], + [[["FOOT"]], []], [[["FOOT"]], [["FOOT"]]], - [[[None]], [[None]]], - [[["FOOT"]], [["FOOT"]], [[None]]], - [[[None]], [["FOOT"]], [[None]]], - [[[None]], [["FOOT"]], [[None]], [[None]], [[None]], [[None]]], + [[], []], + [[["FOOT"]], [["FOOT"]], []], + [[], [["FOOT"]], []], + [[], [["FOOT"]], [], [], [], []], ] @@ -370,6 +370,7 @@ ], ] + orca_opt_status_optline = [ [ [ @@ -10894,15 +10895,15 @@ (Number.Integer, Sign.Positive): True, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): True, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -10911,15 +10912,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): True, (Number.Integer, Sign.Any): True, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, 
Sign.Negative): False, + (Number.Float, Sign.Any): False, (Number.General, Sign.Positive): False, (Number.General, Sign.Negative): True, (Number.General, Sign.Any): True, @@ -10928,15 +10929,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): True, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -10945,15 +10946,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): True, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): True, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): False, (Number.General, Sign.Negative): True, (Number.General, Sign.Any): True, @@ -10962,15 +10963,15 @@ (Number.Integer, Sign.Positive): True, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): True, - (Number.Float, 
Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -10979,15 +10980,15 @@ (Number.Integer, Sign.Positive): True, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): True, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -10996,15 +10997,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): True, (Number.Integer, Sign.Any): True, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): False, + 
(Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, (Number.General, Sign.Positive): False, (Number.General, Sign.Negative): True, (Number.General, Sign.Any): True, @@ -11013,15 +11014,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): True, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11030,15 +11031,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): True, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): 
True, @@ -11047,15 +11048,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): True, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11064,15 +11065,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): True, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11081,15 +11082,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): True, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, 
- (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): True, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11098,15 +11099,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): True, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): True, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): False, (Number.General, Sign.Negative): True, (Number.General, Sign.Any): True, @@ -11115,15 +11116,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): True, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): True, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): 
True, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): False, (Number.General, Sign.Negative): True, (Number.General, Sign.Any): True, @@ -11132,15 +11133,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): True, - (Number.Float, Sign.Any): True, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): True, (Number.Decimal, Sign.Any): True, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): False, (Number.General, Sign.Negative): True, (Number.General, Sign.Any): True, @@ -11149,15 +11150,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, (Number.SciNot, Sign.Positive): True, (Number.SciNot, Sign.Negative): False, (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11166,15 +11167,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - 
(Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, (Number.SciNot, Sign.Positive): True, (Number.SciNot, Sign.Negative): False, (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11183,15 +11184,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, (Number.SciNot, Sign.Positive): True, (Number.SciNot, Sign.Negative): False, (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11200,15 +11201,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, (Number.SciNot, Sign.Positive): True, (Number.SciNot, Sign.Negative): False, (Number.SciNot, Sign.Any): True, - (Number.Decimal, 
Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11217,15 +11218,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, (Number.SciNot, Sign.Positive): True, (Number.SciNot, Sign.Negative): False, (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11234,15 +11235,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, (Number.SciNot, Sign.Positive): True, (Number.SciNot, Sign.Negative): False, (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ 
-11251,15 +11252,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, (Number.SciNot, Sign.Positive): True, (Number.SciNot, Sign.Negative): False, (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11268,15 +11269,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): False, + (Number.Decimal, Sign.Any): False, (Number.SciNot, Sign.Positive): True, (Number.SciNot, Sign.Negative): False, (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): True, - (Number.Decimal, Sign.Negative): False, - (Number.Decimal, Sign.Any): True, + (Number.Float, Sign.Positive): True, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): True, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): True, @@ -11285,15 +11286,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, + (Number.Decimal, Sign.Positive): False, + (Number.Decimal, Sign.Negative): 
False, + (Number.Decimal, Sign.Any): False, (Number.SciNot, Sign.Positive): False, (Number.SciNot, Sign.Negative): True, (Number.SciNot, Sign.Any): True, - (Number.Decimal, Sign.Positive): False, - (Number.Decimal, Sign.Negative): True, - (Number.Decimal, Sign.Any): True, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): True, + (Number.Float, Sign.Any): True, (Number.General, Sign.Positive): False, (Number.General, Sign.Negative): True, (Number.General, Sign.Any): True, @@ -11302,15 +11303,15 @@ (Number.Integer, Sign.Positive): False, (Number.Integer, Sign.Negative): False, (Number.Integer, Sign.Any): False, - (Number.Float, Sign.Positive): False, - (Number.Float, Sign.Negative): False, - (Number.Float, Sign.Any): False, - (Number.SciNot, Sign.Positive): False, - (Number.SciNot, Sign.Negative): False, - (Number.SciNot, Sign.Any): False, (Number.Decimal, Sign.Positive): False, (Number.Decimal, Sign.Negative): False, (Number.Decimal, Sign.Any): False, + (Number.SciNot, Sign.Positive): False, + (Number.SciNot, Sign.Negative): False, + (Number.SciNot, Sign.Any): False, + (Number.Float, Sign.Positive): False, + (Number.Float, Sign.Negative): False, + (Number.Float, Sign.Any): False, (Number.General, Sign.Positive): False, (Number.General, Sign.Negative): False, (Number.General, Sign.Any): False, diff --git a/pent/thrulist.py b/pent/thrulist.py index 8c353a3..bbf82ae 100644 --- a/pent/thrulist.py +++ b/pent/thrulist.py @@ -9,7 +9,7 @@ 3 Oct 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -44,7 +44,3 @@ def __getitem__(self, key): ) else: raise ThruListError(msg="Numeric index required for len != 1") - - -if __name__ == "__main__": # pragma: no cover - print("Module not executable.") diff --git a/pent/token.py b/pent/token.py index 4ec10a0..531159e 100644 --- a/pent/token.py +++ b/pent/token.py @@ -9,7 +9,7 @@ 20 Sep 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) 
Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent @@ -182,7 +182,7 @@ def match_quantity(self): """Return match quantity. |None| for :attr:`pent.enums.Content.Any` or - :attr:`pent.enums.Content.OptionalLine + :attr:`pent.enums.Content.OptionalLine` """ if self.is_any or self.is_optional_line: @@ -326,7 +326,3 @@ def _selective_group_enclose(self, pat): return (self._group_open() + pat + self._group_close(), True) else: return pat, False - - -if __name__ == "__main__": # pragma: no cover - print("Module not executable.") diff --git a/pent/utils.py b/pent/utils.py index 9dd7e84..e495104 100644 --- a/pent/utils.py +++ b/pent/utils.py @@ -9,7 +9,7 @@ 14 Oct 2018 **Copyright** - \(c) Brian Skinn 2018 + \(c) Brian Skinn 2018-2019 **Source Repository** http://www.github.com/bskinn/pent diff --git a/requirements-dev.txt b/requirements-dev.txt index d41681d..cfd2194 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,14 +1,13 @@ -attrs<18 +attrs numpy -sphinx==1.7.8 -sphinx_rtd_theme==0.4.1 +sphinx>=2,<3 +sphinx_rtd_theme sphinx-issues -ipython +sphinx-autobuild flake8 flake8-docstrings coverage tox -wget pyparsing black restview diff --git a/requirements-rtd.txt b/requirements-rtd.txt index e614801..119920f 100644 --- a/requirements-rtd.txt +++ b/requirements-rtd.txt @@ -1,5 +1,5 @@ -attrs>=17.1,<18 +attrs pyparsing -sphinx==1.7.6 +sphinx>=2,<3 sphinx-issues -sphinx-rtd-theme==0.4.1 +sphinx-rtd-theme diff --git a/requirements-termux.txt b/requirements-termux.txt deleted file mode 100644 index 58f5d01..0000000 --- a/requirements-termux.txt +++ /dev/null @@ -1,15 +0,0 @@ -attrs<18 -#sphinx==1.7.8 -#sphinx_rtd_theme==0.4.1 -#sphinx-issues -ipython -flake8 -flake8-docstrings -coverage -#tox -#wget -pyparsing -#black -#restview -#twine - diff --git a/requirements-travis.txt b/requirements-travis.txt index aa71c86..60a7c77 100644 --- a/requirements-travis.txt +++ b/requirements-travis.txt @@ -1,5 +1,5 @@ -attrs<18 -numpy==1.15 +attrs 
+numpy pyparsing coverage codecov diff --git a/setup.py b/setup.py index 82f6235..1d14169 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,11 @@ from pent import __version__ -NAME='pent' +NAME = "pent" + def readme(): - with open('README.rst', 'r') as f: + with open("README.rst", "r") as f: content = f.read() # Helper function @@ -18,14 +19,16 @@ def content_update(content, pattern, sub): # This one gets the badge image content = content_update( content, - r'(?<=/readthedocs/{0}/)\S+?(?=\.svg$)'.format(NAME), - 'v' + __version__) + r"(?<=/readthedocs/{0}/)\S+?(?=\.svg$)".format(NAME), + "v" + __version__, + ) # This one gets the RtD links content = content_update( content, - r'(?<={0}\.readthedocs\.io/en/)\S+?(?=[/>])'.format(NAME), - 'v' + __version__) + r"(?<={0}\.readthedocs\.io/en/)\S+?(?=[/>])".format(NAME), + "v" + __version__, + ) return content @@ -35,6 +38,7 @@ def content_update(content, pattern, sub): version=__version__, description="pent Extracts Numerical Text", long_description=readme(), + long_description_content_type="text/x-rst", url="https://www.github.com/bskinn/pent", license="MIT License", author="Brian Skinn", @@ -51,13 +55,13 @@ def content_update(content, pattern, sub): "Operating System :: OS Independent", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Utilities", - "Development Status :: 3 - Alpha", + "Development Status :: 4 - Beta", ], ) diff --git a/tox.ini b/tox.ini index 2b48f8b..4058736 100644 --- a/tox.ini +++ b/tox.ini @@ -2,27 +2,30 @@ minversion=2.0 envlist= py36-attrs_17_4-pp_{1_5_5,2_0_0,2_2_0,latest} - py36-attrs_{17_1,17_4,latest}-pp_2_2_0 - py3{4,5,6,7}-attrs_17_4-pp_2_2_0 - 
py3{4,6}-attrs_17_1-pp_1_5_5 + py36-attrs_{17_1,17_3,17_4,18_1,18_2,19_1,19_2,latest}-pp_2_2_0 + py3{5,6,7,8}-attrs_19_1-pp_2_2_0 + py3{6,7,8}-attrs_17_1-pp_1_5_5 [testenv] -whitelist_externals=/bin/sh commands= python --version -# sh -c 'cd doc; make html' python tests.py -a -# py3{5,6,7}: sh -c 'cd doc; make doctest' deps= attrs_17_1: attrs==17.1 + attrs_17_2: attrs==17.2 attrs_17_3: attrs==17.3 attrs_17_4: attrs==17.4 + attrs_18_1: attrs==18.1 + attrs_18_2: attrs==18.2 + attrs_19_1: attrs==19.1 + attrs_19_2: attrs==19.2 attrs_latest: attrs pp_2_2_0: pyparsing==2.2.0 pp_2_0_0: pyparsing==2.0.0 pp_1_5_5: pyparsing==1.5.5 pp_latest: pyparsing + numpy sphinx sphinx-issues sphinx-rtd-theme @@ -37,8 +40,8 @@ basepython= [testenv:linux] platform=linux basepython= + py38: python3.8 py37: python3.7 py36: python3.6 py35: python3.5 - py34: python3.4