Skip to content

Commit

Permalink
fix/refactor(id_generation): Use dill.dumps as deterministic id gener…
Browse files Browse the repository at this point in the history
…ator
  • Loading branch information
nkemnitz committed Jan 4, 2024
1 parent c1fd9af commit 4c1ac9a
Showing 1 changed file with 11 additions and 62 deletions.
73 changes: 11 additions & 62 deletions zetta_utils/mazepa/id_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import uuid
from typing import Callable, Optional

import dill
import xxhash
from coolname import generate_slug

Expand Down Expand Up @@ -34,74 +35,22 @@ def get_unique_id(
return result


def _get_code_hash(
    fn: Callable,
    _hash: Optional["xxhash.xxh128"] = None,
    _visited: Optional[set[int]] = None,
) -> "xxhash.xxh128":
    """Fold a description of ``fn`` into a streaming hash and return that hash.

    The following pieces are fed to ``_hash.update`` when they exist:

    * every attribute of ``fn`` whose name does not start with ``"__"``:
      callables and properties are hashed recursively, everything else is
      hashed as the encoded text ``"<name>: <value>"``;
    * the object ``fn`` is bound to (``fn.__self__``) and, if present, the
      function behind that object's ``__call__``;
    * ``fn.__qualname__`` and ``fn.__code__.co_code``.

    Attribute names are visited in sorted order. Iterating an unordered set
    of strings would make the order of ``update`` calls -- and therefore the
    digest -- depend on ``PYTHONHASHSEED``, i.e. differ between interpreter
    sessions.

    :param fn: Callable (or any object reached during recursion) to hash.
    :param _hash: Hash object to update in place; a fresh ``xxhash.xxh128``
        is created when omitted. Intended for the recursive calls.
    :param _visited: ``id()`` values of objects already hashed; used to stop
        infinite recursion. Intended for the recursive calls.
    :return: The hash object that was updated (``_hash`` if it was given).
    """
    if _hash is None:
        _hash = xxhash.xxh128()
    if _visited is None:
        _visited = set()

    # Check to prevent infinite recursion.
    # This is a bit silly, as the entire custom code hashing endeavor is done to avoid
    # issues with Python's code hash in the first place...
    # However, PYTHONHASHSEED is not an issue for tracking objects within the same
    # session. Generating recursive loops with the same code hash requires some
    # effort by the user.
    if id(fn) in _visited:
        return _hash
    _visited.add(id(fn))

    # Sorted, so the update order is stable across interpreter sessions.
    attribute_names = sorted(name for name in dir(fn) if not name.startswith("__"))

    for attribute_name in attribute_names:
        # Look on the class first: properties must be hashed as objects rather
        # than evaluated on the instance.
        attrib = getattr(fn.__class__, attribute_name, None)
        if attrib is not None and isinstance(attrib, property):
            _get_code_hash(attrib, _hash, _visited)  # type: ignore
            continue

        attrib = getattr(fn, attribute_name)
        if callable(attrib):
            _get_code_hash(attrib, _hash, _visited)
        else:
            _hash.update(f"{attribute_name}: {attrib}".encode())

    if hasattr(fn, "__self__") and fn.__self__ is not None:
        # Bound method: include the owning object ...
        _get_code_hash(fn.__self__, _hash, _visited)

        # ... and the function behind its `__call__`, when there is one.
        try:
            _get_code_hash(fn.__self__.__call__.__func__, _hash, _visited)
        except AttributeError:
            pass

    # Not every object reached here has a qualified name or bytecode.
    try:
        _hash.update(fn.__qualname__)
    except AttributeError:
        pass

    try:
        _hash.update(fn.__code__.co_code)
    except AttributeError:
        pass

    return _hash


def generate_invocation_id(
fn: Callable,
args: list,
kwargs: dict,
prefix: Optional[str] = None,
):
# Decided against using Python `hash` due to randomized PYTHONHASHSEED, and
# https://github.com/python/cpython/issues/94155 - esp. wrt to `co_code` missing.
# Note that this check skips most code attributes, e.g. co_flags for performance reasons.

x = _get_code_hash(fn)

x.update(args.__repr__().encode())
x.update(kwargs.__repr__().encode())

x = xxhash.xxh128()
x.update(
dill.dumps(
(fn, args, kwargs),
protocol=dill.DEFAULT_PROTOCOL,
byref=False,
recurse=True,
fmode=dill.FILE_FMODE,
)
)
if prefix is not None:
return f"{prefix}-{x.hexdigest()}"
else:
Expand Down

0 comments on commit 4c1ac9a

Please sign in to comment.