Skip to content

Commit

Permalink
make requested changes
Browse files Browse the repository at this point in the history
  • Loading branch information
jacob50231 committed Dec 19, 2024
1 parent 0dcb2ca commit 2f2b213
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 61 deletions.
99 changes: 57 additions & 42 deletions gen3/tools/indexing/post_indexing_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from cdislogging import get_logger, get_stream_handler
from gen3.file import Gen3File
import requests
import re


logger = get_logger(__name__)
Expand All @@ -24,20 +25,19 @@ class GuidError(Exception):
pass


class ManifestError(Exception):
    """Raised when a manifest file cannot be used for validation.

    Visible raise sites: the manifest path is neither ``.csv`` nor ``.tsv``,
    or a manifest row contains a URL that does not match the expected
    ``scheme://host/...`` format.
    """

    pass


class Record:
def __init__(self, guid, bucket, protocol, acl, access_token, commons, size):
def __init__(self, guid, bucket, protocol, acl, size):
self.guid = guid
self.bucket = bucket
self.protocol = protocol
self.acl = acl
self.commons = commons
self.size = size
self.response_status = -1
self.download_status = -1
self.headers = {
"accept": "application/json",
"authorization": f"bearer {access_token}",
}

def check_record(self, gen3file):
"""
Expand Down Expand Up @@ -70,30 +70,25 @@ def check_record(self, gen3file):
logger.info(f"Pre-signed url generation failed for record {self.guid}")
self.response_status = response_status

download_success = -1
if response_status == 200:
try:
download_success = requests.get(url).status_code
self.download_status = requests.get(url).status_code
logger.info(
f"Download process complete with status code {download_success}"
f"Download process complete with status code {self.download_status}"
)
except:
download_success = -1
self.download_status = download_success
except Exception as err:
self.download_status = -1
logger.info(
f"Download process failed due to {err}"
) # maybe should be logger.error
return


class Records:
class RecordParser:
def __init__(self, auth):
self.auth = auth
self.access_token = auth.get_access_token()
self.commons = auth.endpoint
self.record_dict = {}
self.record_sizes = {}
self.headers = {
"accept": "application/json",
"authorization": f"bearer {self.access_token}",
}

def read_records_from_manifest(self, manifest):
"""
Expand All @@ -102,10 +97,17 @@ def read_records_from_manifest(self, manifest):
Args:
manifest (str): the location of a manifest file
"""
if manifest[-3:] == "tsv":
tsv_pattern = "^.*\.tsv$"
csv_pattern = "^.*\.csv$"
if re.match(tsv_pattern, manifest):
sep = "\t"
else:
elif re.match(csv_pattern, manifest):
sep = ","
else:
raise ManifestError(
"Please enter the path to a valid manifest in .csv or .tsv format"
)

with open(manifest, mode="r") as f:
csv_reader = csv.DictReader(f, delimiter=sep)
rows = [row for row in csv_reader]
Expand All @@ -118,32 +120,45 @@ def read_records_from_manifest(self, manifest):
"Manifest file has no column named 'GUID', 'guid', or 'id'"
)

url_pattern = r"^[a-zA-Z][a-zA-Z0-9+.-]*://[^\s/]+(?:/[^\s]*)*$"
for row in rows:
url_parsed = False
size = row["size"]
guid = row[guid_col]
for acl in row["acl"].split(" "):
if acl != "admin":
for url in row["url"].split(" "):
if "://" not in url:
continue
url = url.replace("[", "").replace("]", "")
if re.match(url_pattern, url) == None:
raise ManifestError(
f"Manifest contains guid {guid} with invalid url format: {url}"
)
else:
protocol, bucket = (
url.split("://")[0].replace("[", ""),
url.split("/")[2],
)
key = (bucket, protocol, acl)
if key not in self.record_dict or (
int(self.record_dict[key].size) >= int(size)
and int(size) != 0
):
key = (
bucket,
protocol,
acl,
) # Check a record for each unique (bucket, protocol, acl) combination
if (
key not in self.record_dict
or ( # Add record to the list of records if no matching (bucket,protocol,acl) found
int(self.record_dict[key].size)
>= int(
size
) # Update to download smallest non-zero sized record
and int(size)
!= 0 # Make sure record has non-zero size
)
): # If it passes these we temporarily choose this record to check for the bucket, protocol, and acl
record = Record(
guid,
bucket,
protocol,
acl,
self.access_token,
self.commons,
size,
)
self.record_dict[key] = record
Expand Down Expand Up @@ -179,9 +194,9 @@ def save_download_check_results_to_csv(self, csv_filename):
Args:
csv_filename (str): the relative file path of the output csv
"""
download_results = []
self.download_results = []
for record in self.record_dict.values():
download_results.append(
self.download_results.append(
{
"acl": record.acl,
"bucket": record.bucket,
Expand All @@ -192,10 +207,10 @@ def save_download_check_results_to_csv(self, csv_filename):
}
)

self.download_results = download_results
self.download_results = self.download_results

# Check if the results list is empty
if not download_results:
if not self.download_results:
logger.warning("No results to save.")
return

Expand All @@ -216,15 +231,15 @@ def save_download_check_results_to_csv(self, csv_filename):
writer.writeheader()

# Iterate through the DownloadCheckResult instances and write each row
for result in download_results:
for result in self.download_results:
writer.writerow(
{
"ACL": result["acl"],
"Bucket": result["bucket"],
"Protocol": result["protocol"],
"Presigned URL Status": result["presigned_url_status"],
"Download Status": result["download_status"],
"GUID": result["guid"],
"acl": result["acl"],
"bucket": result["bucket"],
"protocol": result["protocol"],
"presigned_url_status": result["presigned_url_status"],
"download_status": result["download_status"],
"guid": result["guid"],
}
)

Expand All @@ -243,7 +258,7 @@ def validate_manifest(MANIFEST, auth, output_file="results.csv"):
auth (str): a Gen3Auth instance
"""
logger.info("Starting...")
records = Records(auth)
records = RecordParser(auth)
records.read_records_from_manifest(MANIFEST)
records.check_records()
records.save_download_check_results_to_csv(output_file)
Expand Down
79 changes: 60 additions & 19 deletions tests/test_post_indexing_validation.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import pytest
import os
from gen3.tools.indexing.post_indexing_validation import (
Record,
Records,
validate_manifest,
ManifestError,
)
from unittest import mock
from unittest.mock import MagicMock, AsyncMock, mock_open
from aiohttp import ClientResponse
from unittest.mock import MagicMock, mock_open
import gen3
import time
import base64
Expand All @@ -16,7 +14,8 @@
cwd = os.path.dirname(os.path.realpath(__file__))


read_data = "guid,md5,size,acl,url\n255e396f-f1f8-11e9-9a07-0a80fada099c,473d83400bc1bc9dc635e334faddf33d,363455714,['Open'],['s3://pdcdatastore/test1.raw']\n255e396f-f1f8-11e9-9a07-0a80fada098d,473d83400bc1bc9dc635e334faddd33c,343434344,['Open'],['s3://pdcdatastore/test2.raw']\n255e396f-f1f8-11e9-9a07-0a80fada097c,473d83400bc1bc9dc635e334fadd433c,543434443,['phs0001'],['s3://pdcdatastore/test3.raw']"
read_data = "guid,md5,size,acl,url\n255e396f-f1f8-11e9-9a07-0a80fada099c,473d83400bc1bc9dc635e334faddf33d,363455714,[Open],[s3://pdcdatastore/test1.raw]\n255e396f-f1f8-11e9-9a07-0a80fada098d,473d83400bc1bc9dc635e334faddd33c,343434344,[Open],[s3://pdcdatastore/test2.raw]\n255e396f-f1f8-11e9-9a07-0a80fada097c,473d83400bc1bc9dc635e334fadd433c,543434443,[phs0001],[s3://pdcdatastore/test3.raw]"
read_data2 = "guid,md5,size,acl,url\n255e396f-f1f8-11e9-9a07-0a80fada099c,473d83400bc1bc9dc635e334faddf33d,363455714,[Open],[s3://pdcdatastore/test1.raw]\n255e396f-f1f8-11e9-9a07-0a80fada098d,473d83400bc1bc9dc635e334faddd33c,343434344,[Open],[s3://pdcdatastore/test2.raw]\n255e396f-f1f8-11e9-9a07-0a80fada097c,473d83400bc1bc9dc635e334fadd433c,543434443,[phs0001],[invalid_format]"


class MockResponse:
Expand Down Expand Up @@ -76,7 +75,6 @@ def test_validate_manifest_coro_with_200():
output = f"{cwd}/test_data/post_indexing_output.csv"

auth = gen3.auth.Gen3Auth(refresh_token=test_key)
auth.get_access_token()
records = validate_manifest(input, auth, output)

mock_file_open.assert_called_with(
Expand All @@ -85,12 +83,12 @@ def test_validate_manifest_coro_with_200():
mock_writeheader.assert_called_once()
mock_writerow.assert_called_with(
{
"ACL": "['phs0001']",
"Bucket": "pdcdatastore",
"Protocol": "'s3",
"Presigned URL Status": 200,
"Download Status": 200,
"GUID": "255e396f-f1f8-11e9-9a07-0a80fada097c",
"acl": "[phs0001]",
"bucket": "pdcdatastore",
"protocol": "s3",
"presigned_url_status": 200,
"download_status": 200,
"guid": "255e396f-f1f8-11e9-9a07-0a80fada097c",
}
)

Expand Down Expand Up @@ -148,7 +146,6 @@ def test_validate_manifest_coro_with_401():
output = f"{cwd}/test_data/post_indexing_output.csv"

auth = gen3.auth.Gen3Auth(refresh_token=test_key)
auth.get_access_token()
records = validate_manifest(input, auth, output)

mock_file_open.assert_called_with(
Expand All @@ -157,12 +154,12 @@ def test_validate_manifest_coro_with_401():
mock_writeheader.assert_called_once()
mock_writerow.assert_called_with(
{
"ACL": "['phs0001']",
"Bucket": "pdcdatastore",
"Protocol": "'s3",
"Presigned URL Status": 401,
"Download Status": -1,
"GUID": "255e396f-f1f8-11e9-9a07-0a80fada097c",
"acl": "[phs0001]",
"bucket": "pdcdatastore",
"protocol": "s3",
"presigned_url_status": 401,
"download_status": -1,
"guid": "255e396f-f1f8-11e9-9a07-0a80fada097c",
}
)

Expand All @@ -172,3 +169,47 @@ def test_validate_manifest_coro_with_401():
presigned_statuses.append(record["presigned_url_status"])
download_statuses.append(record["download_status"])
assert set(presigned_statuses) == {401} and set(download_statuses) == {-1}


def test_validate_manifest_coro_with_invalid_url():
    """Verify that validate_manifest raises ManifestError for a manifest row
    whose url column is malformed (``[invalid_format]`` in read_data2).

    Auth token acquisition, token decoding, HTTP requests, and csv writing are
    all mocked out so only the manifest-parsing path is exercised.
    """
    hostname = "test.datacommons.io"
    exp = time.time() + 300
    decoded_info = {"aud": "123", "exp": exp, "iss": f"http://{hostname}"}
    test_key = {
        "api_key": "whatever."  # pragma: allowlist secret
        + base64.urlsafe_b64encode(
            ('{"iss": "http://%s", "exp": %d }' % (hostname, exp)).encode("utf-8")
        ).decode("utf-8")
        + ".whatever"
    }
    get_mock = MockResponse(200)
    with mock.patch(
        "gen3.auth.get_access_token_with_key"
    ) as mock_access_token, mock.patch(
        "gen3.auth.Gen3Auth._write_to_file"
    ) as mock_write_to_file, mock.patch(
        "gen3.auth.decode_token"
    ) as mock_decode_token, mock.patch(
        "requests.get"
    ) as mock_request, mock.patch(
        "csv.DictWriter.writerow"
    ) as mock_writerow, mock.patch(
        "csv.DictWriter.writeheader"
    ) as mock_writeheader, mock.patch(
        "builtins.open", mock_open(read_data=read_data2)
    ) as mock_file_open:
        mock_access_token.return_value = "new_access_token"
        mock_write_to_file().return_value = True
        mock_decode_token.return_value = decoded_info
        mock_request.return_value = get_mock
        mock_writerow.return_value = None
        mock_writeheader.return_value = None

        # Renamed from `input` to avoid shadowing the builtin.
        manifest_path = f"{cwd}/test_data/manifest3.csv"
        output = f"{cwd}/test_data/post_indexing_output.csv"

        auth = gen3.auth.Gen3Auth(refresh_token=test_key)
        # The original `try: ... except ManifestError: assert True` passed
        # silently when NO exception was raised, so the test could never fail
        # on the condition it exists to check. Fail explicitly instead.
        try:
            validate_manifest(manifest_path, auth, output)
        except ManifestError:
            pass
        else:
            raise AssertionError(
                "validate_manifest did not raise ManifestError for a manifest "
                "containing an invalid url"
            )

0 comments on commit 2f2b213

Please sign in to comment.