Commit 32bcb61

Merge pull request #120 from ascmitc/dev/multiple-checksum-output

Adding support for multiple checksum formats for create command

mthudgins authored Mar 14, 2022
2 parents a69bfa4 + 4d88065 commit 32bcb61
Showing 10 changed files with 687 additions and 151 deletions.
400 changes: 299 additions & 101 deletions ascmhl/commands.py

Large diffs are not rendered by default.
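The commands.py diff is too large to render here, but the commit message and the updated scenario logs below show the effect: the create command can now compute several checksum formats in a single pass over the files. Presumably (an assumption about the flag syntax, which this page does not show) the hash-format option is simply repeated once per format, along the lines of:

$ ascmhl.py create -v -h xxh64 -h md5 /file_server/A002R2EC

Each requested format then appears as its own hash entry per file in the new generation, which is what produces the per-format log lines in the scenario outputs further down.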

141 changes: 137 additions & 4 deletions ascmhl/generator.py
@@ -41,6 +41,78 @@ def __init__(self, history: MHLHistory, ignore_spec: MHLIgnoreSpec = MHLIgnoreSpec()):
self.new_hash_lists = defaultdict(MHLHashList)
self.ignore_spec = ignore_spec

def append_multiple_format_file_hashes(
self, file_path, file_size, hash_lookup: Dict[str, str], file_modification_date, action=None, hash_date=None
) -> bool:
"""
Adds file hashes to the history
:param file_path: a string value representing the path to a file
:param file_size: size of the file path in bytes
:param hash_lookup: a dictionary of hash values keyed by the respective hash format
:param file_modification_date: date the file was last modified
:param action: a predetermined action for the entry, defaults to None
:param hash_date: date the hashes were generated
:return: a bool indicating whether the hashes were successfully appended; returns False if any failure occurs
"""
relative_path = self.root_history.get_relative_file_path(file_path)
# TODO: handle if path is outside of history root path
# Keep track of the number of failures
failures = 0
history, history_relative_path = self.root_history.find_history_for_path(relative_path)
# for collections we cannot create a valid relative path (we are in the "wrong" history), but in that case
# the file_path is already passed in as the relative path (a bit of implicit functionality here)
if history_relative_path is None:
history_relative_path = file_path

# check if there is an existing hash in the other generations and verify
original_hash_entry = history.find_original_hash_entry_for_path(history_relative_path)

hash_entries = []  # will hold one MHLHashEntry per requested format
# TODO: sort the format keys into a standard order for consistent output
for hash_format, hash_string in hash_lookup.items():
hash_entry = MHLHashEntry(hash_format, hash_string, hash_date=hash_date)
if original_hash_entry is None:
hash_entry.action = "original"
logger.verbose(f" created original hash for {relative_path} {hash_format}: {hash_string}")
else:
existing_hash_entry = history.find_first_hash_entry_for_path(history_relative_path, hash_format)
if existing_hash_entry is not None:
if existing_hash_entry.hash_string == hash_string:
hash_entry.action = "verified"
logger.verbose(f" verified {relative_path} {hash_format}: OK")
else:
hash_entry.action = "failed"
failures += 1
logger.error(
f"ERROR: hash mismatch for {relative_path} "
f"{hash_format} (old): {existing_hash_entry.hash_string}, "
f"{hash_format} (new): {hash_string}"
)
else:
# in case there is no hash entry for this hash format yet
hash_entry.action = "new"  # mark as 'new' here, will be changed to verified in _validate_new_hash_list
logger.verbose(f" created new (verif.) hash for {relative_path} {hash_format}: {hash_string}")
# collection behavior: overwrite action with action from flattened history
if action is not None:
hash_entry.action = action

# Add the generated entry to the list
hash_entries.append(hash_entry)

# in case the same file is hashed multiple times we want to add all hash entries
new_hash_list = self.new_hash_lists[history]
media_hash = new_hash_list.find_or_create_media_hash_for_path(
history_relative_path, file_size, file_modification_date
)

# Add the new hash entries
for hash_entry in hash_entries:
media_hash.append_hash_entry(hash_entry)

return failures == 0

def append_file_hash(
self, file_path, file_size, file_modification_date, hash_format, hash_string, action=None, hash_date=None
) -> bool:
@@ -66,7 +138,7 @@ def append_file_hash(
if existing_hash_entry is not None:
if existing_hash_entry.hash_string == hash_string:
hash_entry.action = "verified"
logger.verbose(f" verified {relative_path} OK")
logger.verbose(f" verified {relative_path} {hash_format}: OK")
else:
hash_entry.action = "failed"
logger.error(
@@ -77,9 +149,7 @@
else:
# in case there is no hash entry for this hash format yet
hash_entry.action = "new" # mark as 'new' here, will be changed to verified in _validate_new_hash_list
logger.verbose(
f" created new, verified hash for {relative_path} {hash_format}: {hash_string}"
)
logger.verbose(f" created new (verif.) hash for {relative_path} {hash_format}: {hash_string}")

# in case the same file is hashed multiple times we want to add all hash entries
new_hash_list = self.new_hash_lists[history]
@@ -94,6 +164,69 @@
media_hash.append_hash_entry(hash_entry)
return hash_entry.action != "failed"

def append_multiple_format_directory_hashes(
self, path, modification_date, content_hash_lookup: Dict[str, str], structure_hash_lookup: Dict[str, str]
) -> None:
"""
Adds directory hashes to the history
:param path: a string value representing the path to a directory
:param modification_date: date the directory was last modified
:param content_hash_lookup: a dictionary of content hash values keyed by the respective hash format
:param structure_hash_lookup: a dictionary of structure hash values keyed by the respective hash format
:return: None
"""
relative_path = self.root_history.get_relative_file_path(path)
# TODO: handle if path is outside of history root path

history, history_relative_path = self.root_history.find_history_for_path(relative_path)

# in case the same directory is hashed multiple times we want to add all hash entries
new_hash_list = self.new_hash_lists[history]
media_hash = new_hash_list.find_or_create_media_hash_for_path(history_relative_path, None, modification_date)
media_hash.is_directory = True

# Add the content entries
if content_hash_lookup:
for hash_format, content_hash_string in content_hash_lookup.items():
# Find the matching structure hash string for this format, if available
structure_hash_string = structure_hash_lookup.get(hash_format)

hash_entry = MHLHashEntry(hash_format, content_hash_string)
# Attempt to add the structure, if available
hash_entry.structure_hash_string = structure_hash_string
media_hash.append_hash_entry(hash_entry)

if relative_path == ".":
logger.verbose(
f" calculated root hash {hash_format}: "
f"{content_hash_string} (content), "
f"{structure_hash_string} (structure)"
)
else:
logger.verbose(
f" calculated directory hash for {relative_path} {hash_format}: "
f"{content_hash_string} (content), "
f"{structure_hash_string} (structure)"
)
else:
logger.verbose(f" added directory entry for {relative_path}")

# in case we just created the root media hash of the current hash list we also add it one history level above
if new_hash_list.process_info.root_media_hash is media_hash and history.parent_history:
parent_history = history.parent_history
parent_relative_path = parent_history.get_relative_file_path(path)
parent_hash_list = self.new_hash_lists[parent_history]
parent_media_hash = parent_hash_list.find_or_create_media_hash_for_path(
parent_relative_path, None, modification_date
)
parent_media_hash.is_directory = True
if content_hash_lookup:
for hash_format, content_hash_string in content_hash_lookup.items():
structure_hash_string = structure_hash_lookup[hash_format]
hash_entry = MHLHashEntry(hash_format, content_hash_string)
hash_entry.structure_hash_string = structure_hash_string
parent_media_hash.append_hash_entry(hash_entry)

def append_directory_hashes(
self, path, modification_date, hash_format, content_hash_string, structure_hash_string
) -> None:
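To see how the new generator entry points fit together, here is a minimal usage sketch (hypothetical: "session" stands in for the generator session object whose methods are shown above, and the file path is illustrative):

import datetime
import os

from ascmhl import hasher

file_path = "/file_server/A002R2EC/Sidecar.txt"  # illustrative path

# Read the file once and compute all requested digests in a single pass.
hash_lookup = hasher.multiple_format_hash_file(file_path, ["xxh64", "md5"])

# Append one hash entry per format; returns False if any verification failed.
ok = session.append_multiple_format_file_hashes(
    file_path,
    os.path.getsize(file_path),
    hash_lookup,
    datetime.datetime.fromtimestamp(os.path.getmtime(file_path)),
)

The single hash_lookup dictionary is what lets the generator record xxh64 and md5 entries against the same media hash without reading the file twice.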
94 changes: 94 additions & 0 deletions ascmhl/hasher.py
@@ -13,6 +13,7 @@
import os
from enum import Enum, unique
from abc import ABC, abstractmethod
from typing import Dict


class Hasher(ABC):
@@ -242,12 +243,84 @@ class HashType(Enum):
c4 = C4


class AggregateHasher:
"""
Handles multiple hashing to facilitate a read-once, create-many hashing paradigm
"""

def __init__(self, hash_formats: [str]):

# Build a hasher for each format
hasher_lookup = {}
for hash_format in hash_formats:
hasher_lookup[hash_format] = new_hasher_for_hash_type(hash_format)

self.hash_formats = hash_formats
self.hasher_lookup = hasher_lookup

@classmethod
def hash_file(cls, file_path: str, hash_formats: [str]) -> Dict[str, str]:
"""
computes and returns new hash strings for a file
arguments:
file_path -- string value, path of file to generate hash for.
hash_formats -- list of strings; each entry should be one of the supported hash formats, e.g. 'md5', 'xxh64'
"""

# Build a hasher for each supplied format
hasher_lookup = {}
for hash_format in hash_formats:
hasher = new_hasher_for_hash_type(hash_format)
hasher_lookup[hash_format] = hasher

# Open the file
with open(file_path, "rb") as fd:
# process files in chunks so that large files won't cause excessive memory consumption.
size = 1024 * 1024 # chunk size 1MB
chunk = fd.read(size)
while chunk:
# Update each stored hasher with the read chunk
for hash_format in hasher_lookup:
hasher_lookup[hash_format].update(chunk)

chunk = fd.read(size)

# Get the digest from each hasher
hash_output_lookup = {}
for hash_format in hasher_lookup:
hash_output_lookup[hash_format] = hasher_lookup[hash_format].string_digest()

return hash_output_lookup

@classmethod
def hash_data(cls, input_data: bytes, hash_formats: [str]) -> Dict[str, str]:
"""
computes and returns new hash strings from the input data
arguments:
input_data -- the bytes to compute the hashes from.
hash_formats -- list of strings; each entry should be one of the supported hash formats, e.g. 'md5', 'xxh64'
"""

# Build a hasher and compute a hash for each supplied format
hash_output_lookup = {}
for hash_format in hash_formats:
hash_generator = new_hasher_for_hash_type(hash_format)
hash_generator.update(input_data)
computed_hash = hash_generator.string_digest()
hash_output_lookup[hash_format] = computed_hash

return hash_output_lookup


class DirectoryHashContext:
"""
DirectoryHashContext wraps the data necessary to compute directory checksums.
"""

def __init__(self, hash_format: str):

self.hash_format = hash_format
self.hasher = new_hasher_for_hash_type(hash_format)
self.content_hash_strings = []
@@ -318,6 +391,17 @@ def hash_of_hash_list(hash_list: [str], hash_format: str) -> str:
return hasher.hash_of_hash_list(hash_list)


def multiple_format_hash_file(file_path: str, hash_formats: [str]) -> Dict[str, str]:
"""
computes and returns new hash strings for a file
arguments:
file_path -- string value, path of file to generate hash for.
hash_formats -- string values, each entry is one of the supported hash formats, e.g. 'md5', 'xxh64'
"""
return AggregateHasher.hash_file(file_path, hash_formats)


def hash_file(filepath: str, hash_format: str) -> str:
"""
computes and returns a new hash string for a file
@@ -342,6 +426,16 @@ def hash_data(input_data: bytes, hash_format: str) -> str:
return hasher.hash_data(input_data)


def multiple_format_hash_data(input_data: bytes, hash_formats: [str]) -> Dict[str, str]:
"""
computes and returns new hash strings from the input data
arguments:
input_data -- the bytes to compute the hash from
hash_formats -- string values, each entry is one of the supported hash formats, e.g. 'md5', 'xxh64'
"""
return AggregateHasher.hash_data(input_data, hash_formats)


def bytes_for_hash_string(hash_string: str, hash_format: str) -> bytes:
"""
wraps the different Hasher string to byte conversions
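Taken together, hasher.py now exposes the read-once, hash-many helpers both on AggregateHasher and as module-level wrappers. A quick sketch of both entry points (the path and input bytes are illustrative):

from ascmhl.hasher import AggregateHasher, multiple_format_hash_data

# One read of the file yields a digest per requested format,
# returned as a dictionary keyed by format, e.g. {"xxh64": "...", "md5": "..."}.
file_digests = AggregateHasher.hash_file(
    "/file_server/A002R2EC/Sidecar.txt", ["xxh64", "md5"]
)

# The same pattern for in-memory bytes via the module-level wrapper.
data_digests = multiple_format_hash_data(b"example bytes", ["xxh64", "md5"])

for hash_format, digest in data_digests.items():
    print(f"{hash_format}: {digest}")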
6 changes: 3 additions & 3 deletions examples/scenarios/Output/scenario_02/log.txt
@@ -25,10 +25,10 @@ this will verify all hashes, check for completeness and create a second generation

$ ascmhl.py create -v /file_server/A002R2EC -h xxh64
Creating new generation for folder at path: /file_server/A002R2EC ...
verified Clips/A002C006_141024_R2EC.mov OK
verified Clips/A002C007_141024_R2EC.mov OK
verified Clips/A002C006_141024_R2EC.mov xxh64: OK
verified Clips/A002C007_141024_R2EC.mov xxh64: OK
calculated directory hash for Clips xxh64: 4c226b42e27d7af3 (content), 906faa843d591a9f (structure)
verified Sidecar.txt OK
verified Sidecar.txt xxh64: OK
calculated root hash xxh64: 8d02114c32e28cbe (content), f557f8ca8e5a88ef (structure)
Created new generation ascmhl/0002_A002R2EC_2020-01-17_143000.mhl

12 changes: 6 additions & 6 deletions examples/scenarios/Output/scenario_03/log.txt
@@ -28,13 +28,13 @@ and create a second generation with additional (new) MD5 hashes.

$ ascmhl.py create -v -h md5 /file_server/A002R2EC
Creating new generation for folder at path: /file_server/A002R2EC ...
verified Clips/A002C006_141024_R2EC.mov OK
created new, verified hash for Clips/A002C006_141024_R2EC.mov md5: f5ac8127b3b6b85cdc13f237c6005d80
verified Clips/A002C007_141024_R2EC.mov OK
created new, verified hash for Clips/A002C007_141024_R2EC.mov md5: 614dd0e977becb4c6f7fa99e64549b12
verified Clips/A002C006_141024_R2EC.mov xxh64: OK
created new (verif.) hash for Clips/A002C006_141024_R2EC.mov md5: f5ac8127b3b6b85cdc13f237c6005d80
verified Clips/A002C007_141024_R2EC.mov xxh64: OK
created new (verif.) hash for Clips/A002C007_141024_R2EC.mov md5: 614dd0e977becb4c6f7fa99e64549b12
calculated directory hash for Clips md5: 202a2d71b56b080d9b089c1f4f29a4ba (content), 4a739024fd19d928e9dea6bb5c480200 (structure)
verified Sidecar.txt OK
created new, verified hash for Sidecar.txt md5: 6425c5a180ca0f420dd2b25be4536a91
verified Sidecar.txt xxh64: OK
created new (verif.) hash for Sidecar.txt md5: 6425c5a180ca0f420dd2b25be4536a91
calculated root hash md5: 6fae2da9bc6dca45486cb91bfea6db70 (content), be1f2eaed208efbed061845a64cacdfa (structure)
Created new generation ascmhl/0002_A002R2EC_2020-01-17_143000.mhl

4 changes: 2 additions & 2 deletions examples/scenarios/Output/scenario_04/log.txt
@@ -29,8 +29,8 @@ An error is shown and create a new generation that documents the failed verification

$ ascmhl.py create -v /file_server/A002R2EC -h xxh64
Creating new generation for folder at path: /file_server/A002R2EC ...
verified Clips/A002C006_141024_R2EC.mov OK
verified Clips/A002C007_141024_R2EC.mov OK
verified Clips/A002C006_141024_R2EC.mov xxh64: OK
verified Clips/A002C007_141024_R2EC.mov xxh64: OK
calculated directory hash for Clips xxh64: 4c226b42e27d7af3 (content), 906faa843d591a9f (structure)
ERROR: hash mismatch for Sidecar.txt xxh64 (old): 3ab5a4166b9bde44, xxh64 (new): 70d2cf31aaa3eac4
calculated root hash xxh64: 8e52e9c3d15e055c (content), 32706d5f4b48f047 (structure)
12 changes: 6 additions & 6 deletions examples/scenarios/Output/scenario_05/log.txt
@@ -45,15 +45,15 @@ of the card sub folders.

$ ascmhl.py create -v /file_server/Reels -h xxh64
Creating new generation for folder at path: /file_server/Reels ...
verified A002R2EC/Clips/A002C006_141024_R2EC.mov OK
verified A002R2EC/Clips/A002C007_141024_R2EC.mov OK
verified A002R2EC/Clips/A002C006_141024_R2EC.mov xxh64: OK
verified A002R2EC/Clips/A002C007_141024_R2EC.mov xxh64: OK
calculated directory hash for A002R2EC/Clips xxh64: 4c226b42e27d7af3 (content), 906faa843d591a9f (structure)
verified A002R2EC/Sidecar.txt OK
verified A002R2EC/Sidecar.txt xxh64: OK
calculated directory hash for A002R2EC xxh64: 8d02114c32e28cbe (content), f557f8ca8e5a88ef (structure)
verified A003R2EC/Clips/A003C011_141024_R2EC.mov OK
verified A003R2EC/Clips/A003C012_141024_R2EC.mov OK
verified A003R2EC/Clips/A003C011_141024_R2EC.mov xxh64: OK
verified A003R2EC/Clips/A003C012_141024_R2EC.mov xxh64: OK
calculated directory hash for A003R2EC/Clips xxh64: f2afc6434255a53d (content), a25d5ca89c95f9e2 (structure)
verified A003R2EC/Sidecar.txt OK
verified A003R2EC/Sidecar.txt xxh64: OK
calculated directory hash for A003R2EC xxh64: 7a82373c131cf40a (content), 1131a950fcc55e4b (structure)
created original hash for Summary.txt xxh64: 0ac48e431d4538ba
calculated root hash xxh64: 92950bc8fda076ec (content), 2c2ce52605558158 (structure)