Skip to content

Commit

Permalink
feat!: use binary hash column (#105)
Browse files Browse the repository at this point in the history
* v0.7.0

* add "hash" column as raw bytes

* Dict4.V2

* fix V2 args

* Update CHANGELOG.md
  • Loading branch information
drernie committed Jan 14, 2024
1 parent 2d27f8c commit 641c782
Show file tree
Hide file tree
Showing 12 changed files with 51 additions and 22 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# CHANGELOG.md

## 0.7.0 (2024-01-12)

- Store un-hexed multihash in Dict4.hash
- Use Dict4.V2 to add that hash to the struct
- Drop version down to "v2" (from "v4")

## 0.6.2 (2023-10-22)

- Cleanup for demo

## 0.5.1 (2023-10-21)

- Remove legacy code
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "quiltcore"
version = "0.6.2"
version = "0.7.0"
description = "low-level plumbing to read/write Quilt packages"
authors = ["Ernest Prabhakar <[email protected]>"]
readme = "README.md"
Expand Down
2 changes: 1 addition & 1 deletion quiltcore/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,4 @@ def commit(self, message: str = "Updated", user_meta: dict = {}) -> Multihash:
if user_meta:
setattr(self.header, self.K_USER_META, user_meta)
self.update()
return self.hash()
return self.hashify()
4 changes: 2 additions & 2 deletions quiltcore/table4.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def _get_table(self) -> pa.Table:

def _get_head(self) -> pa.Table:
"""Extract header values into attributes."""
return Dict4(**self.first())
return Dict4.V2(**self.first())

def _get_body(self) -> pa.Table:
"""
Expand All @@ -40,4 +40,4 @@ def get_dict4(self, key: str) -> Dict4:
"""Return the dict4 for a child resource."""
row = self.get_row(key)
assert row, f"Missing row for {key}"
return Dict4(**row)
return Dict4.V2(**row)
10 changes: 7 additions & 3 deletions quiltcore/udg/codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,14 @@ def digester(self, hash_type=None):
digest_type = self.hash_config(self.MH_DIG)[ht]
return multihash.get(digest_type)

def digest_raw(self, bstring: bytes) -> bytes:
    """Return the multihash digest of ``bstring`` as raw (un-hexed) bytes.

    The digester is obtained from ``self.digester()`` called with no
    arguments, and its ``digest`` result is returned unchanged.
    """
    return self.digester().digest(bstring)

def digest(self, bstring: bytes) -> Multihash:
"""return multihash digest as hex"""
digester = self.digester()
return digester.digest(bstring).hex()
return self.digest_raw(bstring).hex()

def decode_q3hash(self, q3hash: str) -> Multihash:
hash_type = self.config("hash_type")
Expand Down Expand Up @@ -212,7 +216,7 @@ def decode_dict3(self, row: Dict3) -> Dict4:
if decoded["info"] and self.K_USER_META in decoded["info"]:
decoded["meta"] = decoded["info"][self.K_USER_META]
del decoded["info"][self.K_USER_META]
return Dict4(**decoded)
return Dict4.V2(**decoded).recode_hash()

def decode_item(self, item, opts={}):
"""decode scalar or compound item"""
Expand Down
1 change: 1 addition & 0 deletions quiltcore/udg/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def HeaderDict4(
name=cls.HEADER_NAME,
place=cls.HEADER_NAME,
size=cls.SIZE,
hash=b"",
multihash=cls.MULTIHASH,
info={
cls.K_VERSION: version,
Expand Down
15 changes: 13 additions & 2 deletions quiltcore/udg/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class Types:
HEADER_NAME = "."
HEADER_V3 = "v0"
HEADER_V4 = "v4"
HEADER_V4 = "v2"
MULTIHASH = "1220"

IS_LOCAL = compile(r"file:\/*")
Expand Down Expand Up @@ -131,18 +131,29 @@ class Dict4(DataDict):
name: str
place: str
size: int
multihash: str
hash: bytes | None # raw binary
multihash: str # hex-encoded
info: dict # was (system) metadata
meta: dict # was user_meta
workflow: Optional[str] = None

@staticmethod
def V2(**dict4) -> "Dict4":
    """Build a Dict4 whose ``hash`` holds the raw bytes of ``multihash``.

    The hex string in ``dict4["multihash"]`` is decoded with
    ``bytes.fromhex`` and stored under ``"hash"``, replacing any ``hash``
    value the caller passed in. All keyword arguments are then forwarded
    to the ``Dict4`` constructor.

    Raises:
        KeyError: if ``multihash`` is not among the keyword arguments.
        ValueError: if ``multihash`` is not a valid hex string.
    """
    raw_hash = bytes.fromhex(dict4["multihash"])
    dict4.update(hash=raw_hash)
    return Dict4(**dict4)

def recode_hash(self) -> "Dict4":
    """Refresh ``self.hash`` from the hex-encoded ``self.multihash``.

    The instance is modified in place and returned, so the call can be
    chained onto a constructor expression.
    """
    decoded = bytes.fromhex(self.multihash)
    self.hash = decoded
    return self

def to_parquet_dict(self) -> dict:
    """Return ``self.to_dict()`` with JSON fields serialized to strings.

    For each name in ``Types.K_JSON_FIELDS`` that is present in the dict,
    the value is serialized with ``json_dumps(..., default=str)`` and
    stored under the key ``"<field>.json"``; the original key is removed.
    All other entries are returned as ``to_dict()`` produced them.
    """
    # Named ``row`` rather than ``map`` so the builtin ``map`` is not shadowed.
    row = self.to_dict()
    for field in Types.K_JSON_FIELDS:
        if field in row:
            json_field = f"{field}.json"
            # default=str: values json cannot encode natively are stringified.
            row[json_field] = json_dumps(row[field], default=str)
            del row[field]

    return row


Expand Down
12 changes: 7 additions & 5 deletions quiltcore/udg/verifiable.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def hashable_path(self) -> Path | None:

def hashable_values(self) -> str:
"""Concatenate the hashes of each Verifiable in values()."""
hashes = [v.hash() for v in self.values() if isinstance(v, Verifiable)]
hashes = [v.hashify() for v in self.values() if isinstance(v, Verifiable)]
return "".join(hashes)

def to_bytes(self) -> bytes:
Expand Down Expand Up @@ -84,17 +84,19 @@ def _multihash_contents(self) -> Multihash:
return self.digest_bytes(self.to_bytes())

def dict4_from_path(self, path: Path) -> Dict4:
raw_hash = self.cf.digest_raw(path.read_bytes())
base = Dict4(
name=path.name,
place="",
size=0,
multihash=self.digest_bytes(path.read_bytes()),
hash=raw_hash,
multihash=raw_hash.hex(),
info={},
meta={},
)
return self.UpdateDict4(base, path)

def hash(self) -> Multihash:
def hashify(self) -> Multihash:
"""Return (or calculate) the multihash of the contents."""
if self._hash is None or self.is_dirty():
self._hash = self._multihash_contents()
Expand All @@ -106,7 +108,7 @@ def q3hash_from_hash(self, mh: Multihash) -> str:

def q3hash(self) -> str:
"""Return the value portion of the legacy quilt3 hash."""
return self.q3hash_from_hash(self.hash())
return self.q3hash_from_hash(self.hashify())

#
# Hash retrieval
Expand All @@ -120,7 +122,7 @@ def verify(self, contents: bytes) -> bool:
"""Verify that multihash digest of bytes match the current multihash"""
digest = self.digest_bytes(contents)
logging.debug(f"verify.digest: {digest}")
return digest == self.hash()
return digest == self.hashify()


class VerifyDict(Verifiable):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def test_arrow_relax():
assert table4
meta = table4.head.info
assert meta
assert meta["version"] == "v4"
assert meta["version"] == "v2"
assert "ONLYME.md" in table4.keys()
entry = table4["ONLYME.md"]
assert entry
Expand Down
2 changes: 1 addition & 1 deletion tests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,4 @@ def test_man_relax(man: Manifest, domain: Domain):
assert isinstance(local_man, Manifest)
assert local_man.path.exists()
assert domain.store in local_man.path.parents
assert local_man.hash() == man.hash() # TODO: force recalculation
assert local_man.hashify() == man.hashify() # TODO: force recalculation
3 changes: 2 additions & 1 deletion tests/test_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def test_node_dict4_to_meta3(node):
print(meta3)
assert isinstance(meta3, dict)
assert Child.K_USER_META in meta3
assert meta3["version"] == "v4"
assert meta3["version"] == "v2"
assert meta3["message"] == TEST_MSG
assert meta3[Child.K_USER_META] == TEST_META

Expand All @@ -176,6 +176,7 @@ def test_node_dict4_to_dict3(node):
dict4 = Dict4(
name="name",
place="s3://place/is here",
hash=b"12201234",
multihash="12201234",
size=123,
info={},
Expand Down
10 changes: 5 additions & 5 deletions tests/test_verifiable.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,35 +34,35 @@ def test_ver_raise(ver: Verifiable):
with raises(ValueError):
ver.to_bytes()
with raises(ValueError):
ver.hash()
ver.hashify()


def test_ver_dict(ver: Verifiable):
OLD_DICT = Verifiable.DEFAULT_DICT
Verifiable.DEFAULT_DICT = Verify.TEST_DICT
assert ver.hash() == Verify.HASH_DICT
assert ver.hashify() == Verify.HASH_DICT
Verifiable.DEFAULT_DICT = OLD_DICT


def test_ver_cache(ver: Verifiable):
ver["hash"] = Verify()
assert ver.to_bytes() == Verify.HASH_BYTES.encode("utf-8")
assert ver.hash() == Verify.HASH_HASH
assert ver.hashify() == Verify.HASH_HASH


def test_ver_path(ver: Verifiable):
with TemporaryDirectory() as tmpdirname:
path = Path(tmpdirname) / "test.txt"
path.write_bytes(Verify.TEST_BYTES)
ver.path = path # type: ignore
assert ver.hash() == Verify.HASH_BYTES
assert ver.hashify() == Verify.HASH_BYTES


def test_verify():
verify = Verify()
assert verify is not None

assert verify.hash() == Verify.HASH_BYTES
assert verify.hashify() == Verify.HASH_BYTES
assert verify.q3hash() == Verify.HASH_BYTES[4:]
assert verify.hashable() == b"{}"
assert verify.verify(b"") is False
Expand Down

0 comments on commit 641c782

Please sign in to comment.