Skip to content

Commit

Permalink
Add multiple file support and initial metadata validation
Browse files Browse the repository at this point in the history
  • Loading branch information
RohanBhattaraiNP authored Sep 26, 2024
1 parent 0f5add5 commit 9159d93
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 22 deletions.
8 changes: 3 additions & 5 deletions caltechdata_api/caltechdata_write.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import copy
import json
import os, requests

import os
import requests
import s3fs
from requests import session
from json.decoder import JSONDecodeError
Expand Down Expand Up @@ -49,8 +49,6 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
infile = open(name, "rb")
else:
infile = open(f_list[name], "rb")
# size = infile.seek(0, 2)
# infile.seek(0, 0) # reset at beginning
result = requests.put(link, headers=f_headers, data=infile)
if result.status_code != 200:
raise Exception(result.text)
Expand All @@ -68,7 +66,7 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
def add_file_links(
metadata, file_links, file_descriptions=[], additional_descriptions="", s3_link=None
):
# Currently configured for S3 links, assuming all are at same endpoint
# Currently configured for S3 links, assuming all are at the same endpoint
link_string = ""
endpoint = "https://" + file_links[0].split("/")[2]
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint})
Expand Down
55 changes: 39 additions & 16 deletions caltechdata_api/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def decrypt_token(encrypted_token, key):
return f.decrypt(encrypted_token).decode()


# Function to get or set token
# Function to get or set token with support for test system
def get_or_set_token(production=True):
key = load_or_generate_key()

Expand Down Expand Up @@ -411,6 +411,7 @@ def main():

def create_record(production):
token = get_or_set_token(production)
# keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes"
print("Using CaltechDATA token:", token)
while True:
choice = get_user_input(
Expand Down Expand Up @@ -521,13 +522,10 @@ def print_upload_message(rec_id, production):
else "https://data.caltechlibrary.dev/uploads/"
)
print(
f"""
You can view and publish this record at
f"""You can view and publish this record at
{base_url}{rec_id}
If you need to upload large files to S3, you can type `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`
"""
If you need to upload large files to S3, you can type
`s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`"""
)


Expand All @@ -552,7 +550,6 @@ def edit_record(production):
print(f"An error occurred during metadata editing: {e}")
else:
print("No metadata file found.")

choice = get_user_input("Do you want to add files? (y/n): ").lower()
if choice == "y":
if production:
Expand All @@ -571,19 +568,32 @@ def edit_record(production):
url = API_URL_TEMPLATE.format(record_id=record_id)
url_draft = API_URL_TEMPLATE_DRAFT.format(record_id=record_id)

response = requests.get(url)
response_draft = requests.get(url_draft)
headers = {
"accept": "application/json",
}

filepath, file_link = upload_supporting_file(record_id)
print(file_link)
if token:
headers["Authorization"] = "Bearer %s" % token

if response.status_code == 404 and response_draft.status_code == 404:
response = requests.get(url, headers=headers)
response_draft = requests.get(url_draft, headers=headers)
data = response.json()
data_draft = response_draft.json()
# Check if 'entries' exists and its length
if (
len(data.get("entries", [])) == 0
and len(data_draft.get("entries", [])) == 0
):
keepfile = False
else:
keepfile = (
input("Do you want to keep existing files? (y/n): ").lower() == "y"
)

filepath, file_link = upload_supporting_file(record_id)
if file_link:
print(file_link)

if filepath != "":
response = caltechdata_edit(
record_id,
Expand All @@ -601,7 +611,7 @@ def edit_record(production):
file_links=file_link,
production=production,
publish=False,
keepfile=keepfile,
keepfiles=keepfile,
)

rec_id = response
Expand All @@ -620,15 +630,28 @@ def download_file_by_id(record_id, token=None):

try:
response = requests.get(url, headers=headers)

if response.status_code != 200:
# Might have a draft
response = requests.get(
url + "/draft",
headers=headers,
)
if response.status_code != 200:
raise Exception(f"Record {record_id} does not exist, cannot edit")
url = f"https://data.caltechlibrary.dev/api/records/{record_id}"
response = requests.get(
url,
headers=headers,
)
if response.status_code != 200:
# Might have a draft
response = requests.get(
url + "/draft",
headers=headers,
)
if response.status_code != 200:
raise Exception(
f"Record {record_id} does not exist, cannot edit"
)
file_content = response.content
file_name = f"downloaded_data_{record_id}.json"
with open(file_name, "wb") as file:
Expand Down
166 changes: 165 additions & 1 deletion caltechdata_api/customize_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ def rdm_creators_contributors(person_list, peopleroles):

def customize_schema_rdm(json_record):
# Get vocabularies used in InvenioRDM
vocabularies = get_vocabularies()

vocabularies = get_vocabularies()
validate_metadata(json_record)
peopleroles = vocabularies["crr"]
resourcetypes = vocabularies["rsrct"]
descriptiontypes = vocabularies["dty"]
Expand Down Expand Up @@ -386,6 +387,169 @@ def customize_schema_rdm(json_record):
return final


def validate_metadata(json_record):
    """
    Validate the presence and structure of required fields in a CaltechDATA
    JSON record.

    All problems found are accumulated and reported together in a single
    ValueError, so callers see the complete set of issues at once instead of
    fixing them one crash at a time.

    Parameters
    ----------
    json_record : dict
        DataCite-style metadata record to validate.

    Raises
    ------
    ValueError
        If any required field is missing or structured incorrectly.
    """
    errors = []

    # Check for 'types' and its required sub-fields. The 'resourceType'
    # check is nested here (rather than done unconditionally later) so a
    # missing or non-dict 'types' is reported as a validation error instead
    # of raising KeyError/TypeError.
    if "types" not in json_record:
        errors.append("'types' field is missing.")
    elif not isinstance(json_record["types"], dict):
        errors.append("'types' field should be a dictionary.")
    else:
        if "resourceTypeGeneral" not in json_record["types"]:
            errors.append("'resourceTypeGeneral' field is missing in 'types'.")
        if "resourceType" not in json_record["types"]:
            errors.append("'resourceType' field is missing in 'types'.")
        elif not isinstance(json_record["types"]["resourceType"], str):
            errors.append("'resourceType' should be a string.")

    # Check for 'title'
    if "titles" not in json_record:
        errors.append("'titles' field is missing.")
    elif not isinstance(json_record["titles"], list) or len(json_record["titles"]) == 0:
        errors.append("'titles' should be a non-empty list.")
    else:
        # Ensure each title is a dictionary with 'title' field
        for title in json_record["titles"]:
            if not isinstance(title, dict) or "title" not in title:
                errors.append(
                    "Each entry in 'titles' must be a dictionary with a 'title' key."
                )

    # Check for 'publication_date' — either form is acceptable.
    if "publicationYear" not in json_record and "dates" not in json_record:
        errors.append(
            "A publication date is required ('publicationYear' or 'dates' field is missing)."
        )
    if "dates" in json_record:
        if not isinstance(json_record["dates"], list):
            errors.append("'dates' should be a list.")
        else:
            for date_entry in json_record["dates"]:
                if (
                    not isinstance(date_entry, dict)
                    or "dateType" not in date_entry
                    or "date" not in date_entry
                ):
                    errors.append(
                        "Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys."
                    )

    # Check for 'creators'
    if "creators" not in json_record:
        errors.append("'creators' field is missing.")
    elif (
        not isinstance(json_record["creators"], list)
        or len(json_record["creators"]) == 0
    ):
        errors.append("'creators' should be a non-empty list.")
    else:
        for creator in json_record["creators"]:
            if not isinstance(creator, dict) or "name" not in creator:
                errors.append(
                    "Each creator in 'creators' must be a dictionary with a 'name' key."
                )

    # Check for 'contributors' (optional)
    if "contributors" in json_record:
        if not isinstance(json_record["contributors"], list):
            errors.append("'contributors' should be a list.")
        else:
            for contributor in json_record["contributors"]:
                if not isinstance(contributor, dict) or "name" not in contributor:
                    errors.append(
                        "Each contributor must be a dictionary with a 'name' key."
                    )

    # Check for 'identifiers' (optional)
    if "identifiers" in json_record:
        if not isinstance(json_record["identifiers"], list):
            errors.append("'identifiers' should be a list.")
        else:
            for identifier in json_record["identifiers"]:
                if (
                    not isinstance(identifier, dict)
                    or "identifier" not in identifier
                    or "identifierType" not in identifier
                ):
                    errors.append(
                        "Each identifier must be a dictionary with 'identifier' and 'identifierType' keys."
                    )

    # Check for 'subjects' (optional)
    if "subjects" in json_record:
        if not isinstance(json_record["subjects"], list):
            errors.append("'subjects' should be a list.")
        else:
            for subject in json_record["subjects"]:
                if not isinstance(subject, dict) or "subject" not in subject:
                    errors.append(
                        "Each subject must be a dictionary with a 'subject' key."
                    )

    # Check for 'relatedIdentifiers' (optional)
    if "relatedIdentifiers" in json_record:
        if not isinstance(json_record["relatedIdentifiers"], list):
            errors.append("'relatedIdentifiers' should be a list.")
        else:
            for related_id in json_record["relatedIdentifiers"]:
                if (
                    not isinstance(related_id, dict)
                    or "relatedIdentifier" not in related_id
                ):
                    errors.append(
                        "Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key."
                    )

    # Check for 'rightsList' (optional)
    if "rightsList" in json_record:
        if not isinstance(json_record["rightsList"], list):
            errors.append("'rightsList' should be a list.")
        else:
            for rights in json_record["rightsList"]:
                if not isinstance(rights, dict) or "rights" not in rights:
                    errors.append(
                        "Each entry in 'rightsList' must be a dictionary with a 'rights' key."
                    )

    # Check for 'geoLocations' (optional)
    if "geoLocations" in json_record:
        if not isinstance(json_record["geoLocations"], list):
            errors.append("'geoLocations' should be a list.")
        else:
            for location in json_record["geoLocations"]:
                if not isinstance(location, dict):
                    errors.append("Each entry in 'geoLocations' must be a dictionary.")
                elif (
                    "geoLocationPoint" not in location
                    and "geoLocationBox" not in location
                    and "geoLocationPlace" not in location
                ):
                    errors.append(
                        "Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'."
                    )

    # Check for 'fundingReferences' (optional)
    if "fundingReferences" in json_record:
        if not isinstance(json_record["fundingReferences"], list):
            errors.append("'fundingReferences' should be a list.")
        else:
            for funding in json_record["fundingReferences"]:
                if not isinstance(funding, dict):
                    errors.append("Each funding reference must be a dictionary.")
                # 'in' on a non-dict entry would raise TypeError, so only
                # probe for 'funderName' when the entry really is a dict.
                elif "funderName" not in funding:
                    errors.append("Each funding reference must contain 'funderName'.")

    # Raise a single exception listing every problem found.
    if errors:
        raise ValueError(f"Validation errors in metadata: {', '.join(errors)}")


if __name__ == "__main__":
# Read in from file for demo purposes

Expand Down

0 comments on commit 9159d93

Please sign in to comment.