Review notes (text before the first "diff --git" line is not part of the patch):

* Fixed: the TagLocalName message in TreesComparisonResult.__str__ printed the
  right-hand node's location_path where its local_name was intended.
* This patch was re-flowed from a whitespace-collapsed copy. Indentation and
  the placement of blank context lines were reconstructed to satisfy the hunk
  line counts; verify them against the target tree before applying. The
  post-image blob id on the first "index" line predates the fix above.
* NOTE(review): test_transparency still builds `_copy` from `file` rather than
  from `result_file`, so the saved output no longer seems to be compared with
  the origin. Confirm whether that is intended.

diff --git a/delb/utils.py b/delb/utils.py
index fa335bf7..f6b111dc 100644
--- a/delb/utils.py
+++ b/delb/utils.py
@@ -1,30 +1,131 @@
+import enum
+from itertools import zip_longest
+from typing import Optional
+
 from _delb.nodes import NodeBase, TagNode
 from _delb.utils import *  # noqa
 from _delb.utils import __all__
 
 
-def compare_trees(a: NodeBase, b: NodeBase) -> bool:
+# TODO increase test coverage
+
+
+class TreeDifferenceKind(enum.Enum):
+    None_ = enum.auto()
+    NodeContent = enum.auto()
+    NodeType = enum.auto()
+    TagAttributes = enum.auto()
+    TagChildrenSize = enum.auto()
+    TagLocalName = enum.auto()
+    TagNamespace = enum.auto()
+
+
+class TreesComparisonResult:
     """
-    Compares two node trees for equality. While node types that can't have descendants
-    are comparable with a comparison expression, the :class:`TagNode` type deliberately
-    doesn't implement the ``==`` operator, because it isn't clear whether a comparison
-    should also consider the node's descendants as this function does.
+    Instances of this class describe one or no difference between two trees.
+    Casting an instance to :class:`bool` will yield :obj:`True` when it describes no
+    difference, thus the compared trees were equal.
+    Casted to strings they're intended to support debugging.
     """
-    if not isinstance(a, TagNode):
-        return a == b
-
-    if (
-        not isinstance(b, TagNode)
-        or a.namespace != b.namespace
-        or a.local_name != b.local_name
-        or a.attributes != b.attributes
-        or len(a) != len(b)
+
+    def __init__(
+        self,
+        difference_kind: TreeDifferenceKind,
+        lhn: Optional[NodeBase],
+        rhn: Optional[NodeBase],
     ):
-        return False
+        self.difference_kind = difference_kind
+        self.lhn: Optional[NodeBase] = lhn
+        self.rhn: Optional[NodeBase] = rhn
+
+    def __bool__(self):
+        return self.difference_kind is TreeDifferenceKind.None_
+
+    def __str__(self):
+        difference_kind = self.difference_kind
+
+        if difference_kind is TreeDifferenceKind.None_:
+            return "Trees are equal."
+
+        parent = self.lhn.parent
+        if parent is None:
+            parent_msg_tail = ":"
+        else:
+            parent_msg_tail = f", parent node has location_path {parent.location_path}:"
+
+        if difference_kind is TreeDifferenceKind.NodeContent:
+            return f"Nodes' content differ{parent_msg_tail}\n{self.lhn!r}\n{self.rhn!r}"
+        elif difference_kind is TreeDifferenceKind.NodeType:
+            return (
+                f"Nodes are of different type{parent_msg_tail} "
+                f"{self.lhn.__class__} != {self.rhn.__class__}"
+            )
+
+        assert isinstance(self.lhn, TagNode)
+        assert isinstance(self.rhn, TagNode)
+
+        if difference_kind is TreeDifferenceKind.TagAttributes:
+            return (
+                f"Attributes of tag nodes at {self.lhn.location_path} differ:\n"
+                f"{self.lhn.attributes}\n{self.rhn.attributes}"
+            )
+        elif difference_kind is TreeDifferenceKind.TagChildrenSize:
+            result = f"Child nodes of tag nodes at {self.lhn.location_path} differ:"
+            for a, b in zip_longest(
+                self.lhn.iterate_children(), self.rhn.iterate_children(), fillvalue=None
+            ):
+                result += f"\n\n{a!r}\n{b!r}"
+            return result
+        elif difference_kind is TreeDifferenceKind.TagLocalName:
+            return (
+                f"Local names of tag nodes at {self.lhn.location_path} differ: "
+                f"{self.lhn.local_name} != {self.rhn.local_name}"
+            )
+        elif difference_kind is TreeDifferenceKind.TagNamespace:
+            return (
+                f"Namespaces of tag nodes at {self.lhn.location_path} differ: "
+                f"{self.lhn.namespace} != {self.rhn.namespace}"
+            )
+
+
+def compare_trees(lhr: NodeBase, rhr: NodeBase) -> TreesComparisonResult:
+    """
+    Compares two node trees for equality. Upon the first detection of a difference of
+    nodes that are located at the same position within the compared (sub-)trees a
+    mismatch is reported.
+
+    :param lhr: The node that is considered as root of the left hand operand.
+    :param rhr: The node that is considered as root of the right hand operand.
+    :return: An object that contains information about the first or no difference.
+
+    While node types that can't have descendants are comparable with a comparison
+    expression, the :class:`TagNode` type deliberately doesn't implement the ``==``
+    operator, because it isn't clear whether a comparison should also consider the
+    node's descendants as this function does.
+    """
+    if not isinstance(rhr, lhr.__class__):
+        return TreesComparisonResult(TreeDifferenceKind.NodeType, lhr, rhr)
+
+    if isinstance(lhr, TagNode):
+        assert isinstance(rhr, TagNode)
+        if lhr.namespace != rhr.namespace:
+            return TreesComparisonResult(TreeDifferenceKind.TagNamespace, lhr, rhr)
+        if lhr.local_name != rhr.local_name:
+            return TreesComparisonResult(TreeDifferenceKind.TagLocalName, lhr, rhr)
+        if lhr.attributes != rhr.attributes:
+            return TreesComparisonResult(TreeDifferenceKind.TagAttributes, lhr, rhr)
+        if len(lhr) != len(rhr):
+            return TreesComparisonResult(TreeDifferenceKind.TagChildrenSize, lhr, rhr)
+
+        for lhn, rhn in zip(lhr.iterate_children(), rhr.iterate_children()):
+            result = compare_trees(lhn, rhn)
+            if not result:
+                return result
+
+    elif lhr != rhr:
+        return TreesComparisonResult(TreeDifferenceKind.NodeContent, lhr, rhr)
 
-    return all(
-        compare_trees(x, y) for x, y in zip(a.iterate_children(), b.iterate_children())
-    )
+    return TreesComparisonResult(TreeDifferenceKind.None_, None, None)
 
 
 __all__ = __all__ + (compare_trees.__name__,)
diff --git a/docs/api/utilities.rst b/docs/api/utilities.rst
index 964827cd..761ebae1 100644
--- a/docs/api/utilities.rst
+++ b/docs/api/utilities.rst
@@ -3,6 +3,10 @@
 Utilities
 =========
 
+.. autofunction:: delb.compare_trees
+
+.. autoclass:: delb.utils.TreesComparisonResult
+
 .. autofunction:: delb.first
 
 .. autofunction:: delb.get_traverser
diff --git a/pyproject.toml b/pyproject.toml
index 013c052c..092d368c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -117,7 +117,6 @@ features = ["https-loader"]
 [tool.hatch.envs.benchmarks]
 dependencies = [
   "pytest-benchmark",
-  "xmldiff"
 ]
 [tool.hatch.envs.benchmarks.scripts]
 run = """
@@ -182,7 +181,6 @@ check = "mypy _delb delb"
 dependencies = [
   "pytest-cov",
   "pytest-httpx",
-  "xmldiff"
 ]
 [tool.hatch.envs.unit-tests.scripts]
 check = "python -m pytest --cov=_delb --cov=delb tests"
diff --git a/tests/test_serialization.py b/tests/test_serialization.py
index fb40c03a..8455e426 100644
--- a/tests/test_serialization.py
+++ b/tests/test_serialization.py
@@ -14,7 +14,7 @@
 )
 from _delb.nodes import DETACHED
 
-from tests.utils import assert_documents_are_semantical_equal
+from tests.utils import assert_equal_trees
 
 
 @pytest.mark.parametrize(
@@ -193,7 +193,7 @@ def test_transparency(files_path, result_file):
         origin = Document(file, parser_options=parser_options)
         origin.save(result_file)
         _copy = Document(file, parser_options=parser_options)
-        assert_documents_are_semantical_equal(file, result_file)
+        assert_equal_trees(origin.root, _copy.root)
         assert origin.head_nodes == _copy.head_nodes
         assert origin.tail_nodes == _copy.tail_nodes
 
diff --git a/tests/utils.py b/tests/utils.py
index 3534e945..5bbc659f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,15 +1,16 @@
 import re
 import sys
 
-from xmldiff import main as xmldiff
-
-from delb import altered_default_filters
+from delb import (
+    altered_default_filters,
+    compare_trees,
+    NodeBase,
+)
 
 
 find_processing_instructions = re.compile(r"(<\?\w\S*?(\s.*?)?\?>)").findall
 
-# REMOVE when support for Python 3.9 is dropped
-if sys.version_info < (3, 10):
+if sys.version_info < (3, 10):  # DROPWITH Python 3.9
     from itertools import tee
 
     def pairwise(iterable):
@@ -21,11 +22,11 @@ def pairwise(iterable):
     from itertools import pairwise
 
 
-def assert_documents_are_semantical_equal(old, new):
-    changes = xmldiff.diff_files(
-        str(old), str(new), diff_options={"F": 1.0, "ratio_mode": "accurate"}
-    )
-    assert not changes, changes
+@altered_default_filters()
+def assert_equal_trees(a: NodeBase, b: NodeBase):
+    result = compare_trees(a, b)
+    if not result:
+        raise AssertionError(str(result))
 
 
 def assert_nodes_are_in_document_order(*nodes):