Skip to content

Commit

Permalink
Fixed md5 checksum calculations for large files (#220)
Browse files Browse the repository at this point in the history
* Fixed md5 checksum calculations for large files. Also enabled Python 3.12 and disabled 3.7
---------

Co-authored-by: Adrian Damian <[email protected]>
  • Loading branch information
andamian and Adrian Damian authored Dec 23, 2024
1 parent f48080c commit c343751
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 28 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/cibuild.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.10
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
- name: Install tox
run: python -m pip install --upgrade tox
- name: egg-info cadcutils
Expand All @@ -32,7 +32,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.7","3.8","3.9","3.10","3.11"]
python-version: ["3.8","3.9","3.10","3.11","3.12"]
package: [cadcutils, cadcdata, cadctap]
steps:
- name: Checkout code
Expand All @@ -58,10 +58,10 @@ jobs:
needs: tests
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.10
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
- name: Setup Graphviz
uses: ts-graphviz/setup-graphviz@v2
- name: Install tox
Expand Down
8 changes: 0 additions & 8 deletions cadcdata/cadcdata/storageinv.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,14 +701,6 @@ def _get_transfer_urls(self, id, params=None, is_get=True):
direction='pullFromVoSpace' if is_get else 'pushToVoSpace',
with_uws_job=False, cutout=params)

def _get_md5sum(self, filename):
# return the md5sum of a file
hash_md5 = hashlib.md5()
with open(filename, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
hash_md5.update(chunk)
return hash_md5.hexdigest()

def _get_uris(self, target):
# takes a target URI and if the URI is not fully qualified (schema
# is missing, returns a list of possible fully qualified
Expand Down
2 changes: 1 addition & 1 deletion cadcdata/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name = cadcdata

[tox]
envlist =
py{37,38,39,310,311}
py{38,39,310,311,312,313}
requires =
pip >= 19.3.1

Expand Down
2 changes: 1 addition & 1 deletion cadctap/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name = cadctap

[tox]
envlist =
py{37,38,39,310,311}
py{38,39,310,311,312,313}
requires =
pip >= 19.3.1

Expand Down
4 changes: 3 additions & 1 deletion cadcutils/cadcutils/net/tests/test_groups_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,9 @@ def test_mainapp_get(get_group_mock, basews_mock):
get_group_mock.reset_mock()
sys.argv = 'cadc-groups get --cert cert.pem grID1 grID2'.split()
main_app()
get_group_mock.has_calls([call(group_id='grID1'), call(group_id='grID2')])
assert 2 == get_group_mock.call_count
for ii in get_group_mock.calls:
assert ii in [call(group_id='grID1'), call(group_id='grID2')]


@patch('cadcutils.net.groups_client.Subject.from_cmd_line_args',
Expand Down
34 changes: 24 additions & 10 deletions cadcutils/cadcutils/net/ws.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,11 +493,7 @@ def combine_headers(new_headers):

src_md5 = md5_checksum
if not src_md5 and stat_info.st_size <= MAX_MD5_COMPUTE_SIZE:
hash_md5 = hashlib.md5()
with open(src, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
src_md5 = hash_md5.hexdigest()
src_md5 = BaseDataClient.compute_file_md5(src)

dest_name = os.path.basename(urlparse(url).path)

Expand Down Expand Up @@ -767,6 +763,27 @@ def _resolve_destination_file(dest, src_md5, default_file_name):
else:
return final_dest, final_dest

def compute_file_md5(file_path, undigested=False):
"""Compute the md5 hash of a file.
Args:
file_path (pathlib.Path): the path to the file.
undigested (bool): if True, returns the md5 hash object instead of
the hexdigest. Caller can continue to update the hash before calling
hexdigest
Returns:
str: the md5 hash of the file.
"""
md5_hash = hashlib.md5()
buffer_size = 8 * 1024
with open(file_path, 'rb') as file:
while file_buffer := file.read(buffer_size):
md5_hash.update(file_buffer)
if undigested:
return md5_hash
else:
return md5_hash.hexdigest()

def download_file(self, url, dest=None, **kwargs):
"""Method to download a file from CADC storage (archive or vospace).
This method takes advantage of the HTTP Range feature available
Expand Down Expand Up @@ -794,8 +811,7 @@ def download_file(self, url, dest=None, **kwargs):
dest=dest, src_md5=src_md5, default_file_name=content_disp)
if os.path.isfile(final_dest) and \
src_size == os.stat(final_dest).st_size and \
src_md5 == \
hashlib.md5(open(final_dest, 'rb').read()).hexdigest():
src_md5 == BaseDataClient.compute_file_md5(final_dest):
# nothing to be done
self.logger.info(
'Source and destination identical for {}. Skip transfer!'.format(final_dest))
Expand Down Expand Up @@ -883,10 +899,8 @@ def get_instance(self, block_size):
if response.status_code == requests.codes.partial_content:
# Can resume download. Digest existing content on disk first
update_mode = 'ab'
hash_md5 = BaseDataClient.compute_file_md5(dest_file, undigested=True)
dest_length = os.stat(dest_file).st_size
with open(dest_file, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
else:
os.remove(dest_file)

Expand Down
2 changes: 1 addition & 1 deletion cadcutils/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ url = http://www.cadc-ccda.hia-iha.nrc-cnrc.gc.ca
edit_on_github = False
github_project = opencadc/cadctools
# version should be PEP386 compatible (http://www.python.org/dev/peps/pep-0386)
version = 1.5.3
version = 1.5.4

[options]
install_requires =
Expand Down
2 changes: 1 addition & 1 deletion cadcutils/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name = cadcutils

[tox]
envlist =
py{37,38,39,310,311}
py{37,38,39,310,311,312,313}
requires =
pip >= 19.3.1

Expand Down

0 comments on commit c343751

Please sign in to comment.