Skip to content

Commit d9cb61c

Browse files
committed
adding BaseOperatorPlus
1 parent 8acd3bf commit d9cb61c

File tree

8 files changed

+460
-92
lines changed

8 files changed

+460
-92
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Please check the [ChangeLog](CHANGELOG.md) file for the detailed information.
2525

2626
DeepDiff 8-4-0
2727

28+
- Adding BaseOperatorPlus base class for custom operators
2829
- default_timezone can be passed now to set your default timezone to something other than UTC.
2930
- New summarization algorithm that produces valid json
3031
- Better type hint support

deepdiff/deephash.py

+27-19
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pytz
33
import logging
44
import datetime
5-
from typing import Union
5+
from typing import Union, Optional, Any, List
66
from collections.abc import Iterable, MutableMapping
77
from collections import defaultdict
88
from hashlib import sha1, sha256
@@ -141,31 +141,32 @@ class DeepHash(Base):
141141
def __init__(self,
142142
obj,
143143
*,
144-
hashes=None,
145-
exclude_types=None,
144+
apply_hash=True,
145+
custom_operators: Optional[List[Any]] =None,
146+
default_timezone:Union[datetime.timezone, datetime.timezone, pytz.tzinfo.BaseTzInfo]=datetime.timezone.utc,
147+
encodings=None,
148+
exclude_obj_callback=None,
146149
exclude_paths=None,
147-
include_paths=None,
148150
exclude_regex_paths=None,
151+
exclude_types=None,
149152
hasher=None,
153+
hashes=None,
154+
ignore_encoding_errors=False,
155+
ignore_iterable_order=True,
156+
ignore_numeric_type_changes=False,
157+
ignore_private_variables=True,
150158
ignore_repetition=True,
151-
significant_digits=None,
152-
truncate_datetime=None,
153-
number_format_notation="f",
154-
apply_hash=True,
155-
ignore_type_in_groups=None,
159+
ignore_string_case=False,
156160
ignore_string_type_changes=False,
157-
ignore_numeric_type_changes=False,
161+
ignore_type_in_groups=None,
158162
ignore_type_subclasses=False,
159-
ignore_string_case=False,
160-
use_enum_value=False,
161-
exclude_obj_callback=None,
163+
include_paths=None,
164+
number_format_notation="f",
162165
number_to_string_func=None,
163-
ignore_private_variables=True,
164166
parent="root",
165-
encodings=None,
166-
ignore_encoding_errors=False,
167-
ignore_iterable_order=True,
168-
default_timezone:Union[datetime.timezone, datetime.timezone, pytz.tzinfo.BaseTzInfo]=datetime.timezone.utc,
167+
significant_digits=None,
168+
truncate_datetime=None,
169+
use_enum_value=False,
169170
**kwargs):
170171
if kwargs:
171172
raise ValueError(
@@ -192,7 +193,6 @@ def __init__(self,
192193
self.hashes[UNPROCESSED_KEY] = []
193194
self.use_enum_value = use_enum_value
194195
self.default_timezone = default_timezone
195-
196196
self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
197197
self.truncate_datetime = get_truncate_datetime(truncate_datetime)
198198
self.number_format_notation = number_format_notation
@@ -216,6 +216,7 @@ def __init__(self,
216216
self.encodings = encodings
217217
self.ignore_encoding_errors = ignore_encoding_errors
218218
self.ignore_iterable_order = ignore_iterable_order
219+
self.custom_operators = custom_operators
219220

220221
self._hash(obj, parent=parent, parents_ids=frozenset({get_id(obj)}))
221222

@@ -505,6 +506,13 @@ def _prep_tuple(self, obj, parent, parents_ids):
505506
def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
506507
"""The main hash method"""
507508
counts = 1
509+
if self.custom_operators is not None:
510+
for operator in self.custom_operators:
511+
func = getattr(operator, 'normalize_value_for_hashing', None)
512+
if func is None:
513+
raise NotImplementedError(f"{operator.__class__.__name__} needs to define a normalize_value_for_hashing method to be compatible with ignore_order=True or iterable_compare_func.".format(operator))
514+
else:
515+
obj = func(parent, obj)
508516

509517
if isinstance(obj, booleanTypes):
510518
obj = self._prep_bool(obj)

deepdiff/diff.py

+39-28
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def _report_progress(_stats, progress_logger, duration):
112112
'encodings',
113113
'ignore_encoding_errors',
114114
'default_timezone',
115+
'custom_operators',
115116
)
116117

117118

@@ -130,6 +131,7 @@ def __init__(self,
130131
custom_operators: Optional[List[Any]] =None,
131132
cutoff_distance_for_pairs: float=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
132133
cutoff_intersection_for_pairs: float=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT,
134+
default_timezone:Union[datetime.timezone, datetime.timezone, pytz.tzinfo.BaseTzInfo]=datetime.timezone.utc,
133135
encodings: Optional[List[str]]=None,
134136
exclude_obj_callback: Optional[Callable]=None,
135137
exclude_obj_callback_strict: Optional[Callable]=None,
@@ -156,6 +158,8 @@ def __init__(self,
156158
include_paths: Union[str, List[str], None]=None,
157159
iterable_compare_func: Optional[Callable]=None,
158160
log_frequency_in_sec: int=0,
161+
log_scale_similarity_threshold: float=0.1,
162+
log_stacktrace: bool=False,
159163
math_epsilon: Optional[float]=None,
160164
max_diffs: Optional[int]=None,
161165
max_passes: int=10000000,
@@ -164,15 +168,13 @@ def __init__(self,
164168
progress_logger: Callable=logger.info,
165169
report_repetition: bool=False,
166170
significant_digits: Optional[int]=None,
167-
use_log_scale: bool=False,
168-
log_scale_similarity_threshold: float=0.1,
169171
threshold_to_diff_deeper: float = 0.33,
170172
truncate_datetime: Optional[str]=None,
171173
use_enum_value: bool=False,
174+
use_log_scale: bool=False,
172175
verbose_level: int=1,
173176
view: str=TEXT_VIEW,
174177
zip_ordered_iterables: bool=False,
175-
default_timezone:Union[datetime.timezone, datetime.timezone, pytz.tzinfo.BaseTzInfo]=datetime.timezone.utc,
176178
_parameters=None,
177179
_shared_parameters=None,
178180
**kwargs):
@@ -186,7 +188,7 @@ def __init__(self,
186188
"ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, "
187189
"view, hasher, hashes, max_passes, max_diffs, zip_ordered_iterables, "
188190
"cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
189-
"cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, "
191+
"cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, log_stacktrace,"
190192
"math_epsilon, iterable_compare_func, use_enum_value, _original_type, threshold_to_diff_deeper, default_timezone "
191193
"ignore_order_func, custom_operators, encodings, ignore_encoding_errors, use_log_scale, log_scale_similarity_threshold "
192194
"_parameters and _shared_parameters.") % ', '.join(kwargs.keys()))
@@ -209,6 +211,7 @@ def __init__(self,
209211
self.log_scale_similarity_threshold = log_scale_similarity_threshold
210212
self.use_log_scale = use_log_scale
211213
self.default_timezone = default_timezone
214+
self.log_stacktrace = log_stacktrace
212215
self.threshold_to_diff_deeper = threshold_to_diff_deeper
213216
self.ignore_string_type_changes = ignore_string_type_changes
214217
self.ignore_type_in_groups = self.get_ignore_types_in_groups(
@@ -276,6 +279,10 @@ def _group_by_sort_key(x):
276279
self.cache_size = cache_size
277280
_parameters = self.__dict__.copy()
278281
_parameters['group_by'] = None # overwriting since these parameters will be passed on to other passes.
282+
if log_stacktrace:
283+
self.log_err = logger.exception
284+
else:
285+
self.log_err = logger.error
279286

280287
# Non-Root
281288
if _shared_parameters:
@@ -736,7 +743,7 @@ def _compare_in_order(
736743
self, level,
737744
t1_from_index=None, t1_to_index=None,
738745
t2_from_index=None, t2_to_index=None
739-
):
746+
) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]:
740747
"""
741748
Default compare if `iterable_compare_func` is not provided.
742749
This will compare in sequence order.
@@ -756,7 +763,7 @@ def _get_matching_pairs(
756763
self, level,
757764
t1_from_index=None, t1_to_index=None,
758765
t2_from_index=None, t2_to_index=None
759-
):
766+
) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]:
760767
"""
761768
Given a level get matching pairs. This returns list of two tuples in the form:
762769
[
@@ -1088,44 +1095,48 @@ def _create_hashtable(self, level, t):
10881095
# It only includes the ones needed when comparing iterables.
10891096
# The self.hashes dictionary gets shared between different runs of DeepHash
10901097
# So that any object that is already calculated to have a hash is not re-calculated.
1091-
deep_hash = DeepHash(item,
1092-
hashes=self.hashes,
1093-
parent=parent,
1094-
apply_hash=True,
1095-
**self.deephash_parameters,
1096-
)
1098+
deep_hash = DeepHash(
1099+
item,
1100+
hashes=self.hashes,
1101+
parent=parent,
1102+
apply_hash=True,
1103+
**self.deephash_parameters,
1104+
)
10971105
except UnicodeDecodeError as err:
10981106
err.reason = f"Can not produce a hash for {level.path()}: {err.reason}"
10991107
raise
1100-
except Exception as e: # pragma: no cover
1101-
logger.error("Can not produce a hash for %s."
1102-
"Not counting this object.\n %s" %
1103-
(level.path(), e))
1108+
except NotImplementedError:
1109+
raise
1110+
# except Exception as e: # pragma: no cover
1111+
# logger.error("Can not produce a hash for %s."
1112+
# "Not counting this object.\n %s" %
1113+
# (level.path(), e))
11041114
else:
11051115
try:
11061116
item_hash = deep_hash[item]
11071117
except KeyError:
11081118
pass
11091119
else:
11101120
if item_hash is unprocessed: # pragma: no cover
1111-
logger.warning("Item %s was not processed while hashing "
1121+
self.log_err("Item %s was not processed while hashing "
11121122
"thus not counting this object." %
11131123
level.path())
11141124
else:
11151125
self._add_hash(hashes=local_hashes, item_hash=item_hash, item=item, i=i)
11161126

11171127
# Also we hash the iterables themselves too so that we can later create cache keys from those hashes.
1118-
try:
1119-
DeepHash(
1120-
obj,
1121-
hashes=self.hashes,
1122-
parent=level.path(),
1123-
apply_hash=True,
1124-
**self.deephash_parameters,
1125-
)
1126-
except Exception as e: # pragma: no cover
1127-
logger.error("Can not produce a hash for iterable %s. %s" %
1128-
(level.path(), e))
1128+
DeepHash(
1129+
obj,
1130+
hashes=self.hashes,
1131+
parent=level.path(),
1132+
apply_hash=True,
1133+
**self.deephash_parameters,
1134+
)
1135+
# try:
1136+
# except Exception as e: # pragma: no cover
1137+
# import pytest; pytest.set_trace()
1138+
# self.log_err("Can not produce a hash for iterable %s. %s" %
1139+
# (level.path(), e))
11291140
return local_hashes
11301141

11311142
@staticmethod

deepdiff/operator.py

+32-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,41 @@
11
import re
2+
from typing import Any, Optional, List
3+
from abc import ABCMeta, abstractmethod
24
from deepdiff.helper import convert_item_or_items_into_compiled_regexes_else_none
35

46

7+
8+
class BaseOperatorPlus(metaclass=ABCMeta):
9+
10+
@abstractmethod
11+
def match(self, level) -> bool:
12+
"""
13+
Given a level which includes t1 and t2 in the tree view, is this operator a good match to compare t1 and t2?
14+
If yes, we will run the give_up_diffing to compare t1 and t2 for this level.
15+
"""
16+
pass
17+
18+
@abstractmethod
19+
def give_up_diffing(self, level, diff_instance: float) -> bool:
20+
"""
21+
Given a level which includes t1 and t2 in the tree view, and the "distance" between t1 and t2.
22+
Do we consider t1 and t2 to be equal or not? The distance is a number between zero and one and is calculated by DeepDiff to measure how similar objects are.
23+
"""
24+
25+
@abstractmethod
26+
def normalize_value_for_hashing(self, parent: Any, obj: Any) -> Any:
27+
"""
28+
You can use this function to normalize values for ignore_order=True
29+
30+
For example, you may want to convert all words to lowercase. In that case, you would return obj.lower().
31+
"""
32+
pass
33+
34+
35+
536
class BaseOperator:
637

7-
def __init__(self, regex_paths=None, types=None):
38+
def __init__(self, regex_paths:Optional[List[str]]=None, types:Optional[List[type]]=None):
839
if regex_paths:
940
self.regex_paths = convert_item_or_items_into_compiled_regexes_else_none(regex_paths)
1041
else:

0 commit comments

Comments
 (0)