Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update cli.py #41

Merged
merged 11 commits into from
Sep 26, 2024
8 changes: 3 additions & 5 deletions caltechdata_api/caltechdata_write.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import copy
import json
import os, requests

import os
import requests
import s3fs
from requests import session
from json.decoder import JSONDecodeError
Expand Down Expand Up @@ -49,8 +49,6 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
infile = open(name, "rb")
else:
infile = open(f_list[name], "rb")
# size = infile.seek(0, 2)
# infile.seek(0, 0) # reset at beginning
result = requests.put(link, headers=f_headers, data=infile)
if result.status_code != 200:
raise Exception(result.text)
Expand All @@ -68,7 +66,7 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
def add_file_links(
metadata, file_links, file_descriptions=[], additional_descriptions="", s3_link=None
):
# Currently configured for S3 links, assuming all are at same endpoint
# Currently configured for S3 links, assuming all are at the same endpoint
link_string = ""
endpoint = "https://" + file_links[0].split("/")[2]
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint})
Expand Down
67 changes: 51 additions & 16 deletions caltechdata_api/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def decrypt_token(encrypted_token, key):
return f.decrypt(encrypted_token).decode()


# Function to get or set token
# Function to get or set token with support for test system
def get_or_set_token(production=True):
key = load_or_generate_key()

Expand Down Expand Up @@ -411,6 +411,7 @@ def main():

def create_record(production):
token = get_or_set_token(production)
# keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes"
tmorrell marked this conversation as resolved.
Show resolved Hide resolved
print("Using CaltechDATA token:", token)
while True:
choice = get_user_input(
Expand Down Expand Up @@ -521,13 +522,10 @@ def print_upload_message(rec_id, production):
else "https://data.caltechlibrary.dev/uploads/"
)
print(
f"""
You can view and publish this record at

f"""You can view and publish this record at
{base_url}{rec_id}

If you need to upload large files to S3, you can type `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`
"""
If you need to upload large files to S3, you can type
`s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`"""
)


Expand All @@ -552,7 +550,6 @@ def edit_record(production):
print(f"An error occurred during metadata editing: {e}")
else:
print("No metadata file found.")

choice = get_user_input("Do you want to add files? (y/n): ").lower()
if choice == "y":
if production:
Expand All @@ -571,19 +568,44 @@ def edit_record(production):
url = API_URL_TEMPLATE.format(record_id=record_id)
url_draft = API_URL_TEMPLATE_DRAFT.format(record_id=record_id)

response = requests.get(url)
response_draft = requests.get(url_draft)
headers = {
"accept": "application/json",
}

filepath, file_link = upload_supporting_file(record_id)
print(file_link)
if token:
headers["Authorization"] = "Bearer %s" % token

response = requests.get(url, headers=headers)
response_draft = requests.get(url_draft, headers=headers)

if response.status_code == 404 and response_draft.status_code == 404:
# print(production, response, response_draft)
tmorrell marked this conversation as resolved.
Show resolved Hide resolved
# print(response.status_code, response_draft.status_code)

data = response.json()
data_draft = response_draft.json()

# print(data_draft)
# Check if 'entries' exists and its length
if (
len(data.get("entries", [])) == 0
and len(data_draft.get("entries", [])) == 0
):
keepfile = False
else:
keepfile = (
input("Do you want to keep existing files? (y/n): ").lower() == "y"
)

# if response.status_code == 404 and response_draft.status_code == 404:
# keepfile = False
# else:

# keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y"

filepath, file_link = upload_supporting_file(record_id)
if file_link:
print(file_link)

if filepath != "":
response = caltechdata_edit(
record_id,
Expand All @@ -601,7 +623,7 @@ def edit_record(production):
file_links=file_link,
production=production,
publish=False,
keepfile=keepfile,
keepfiles=keepfile,
)

rec_id = response
Expand All @@ -620,15 +642,28 @@ def download_file_by_id(record_id, token=None):

try:
response = requests.get(url, headers=headers)

if response.status_code != 200:
# Might have a draft
response = requests.get(
url + "/draft",
headers=headers,
)
if response.status_code != 200:
raise Exception(f"Record {record_id} does not exist, cannot edit")
url = f"https://data.caltechlibrary.dev/api/records/{record_id}"
response = requests.get(
url,
headers=headers,
)
if response.status_code != 200:
# Might have a draft
response = requests.get(
url + "/draft",
headers=headers,
)
if response.status_code != 200:
raise Exception(
f"Record {record_id} does not exist, cannot edit"
)
file_content = response.content
file_name = f"downloaded_data_{record_id}.json"
with open(file_name, "wb") as file:
Expand Down
166 changes: 165 additions & 1 deletion caltechdata_api/customize_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ def rdm_creators_contributors(person_list, peopleroles):

def customize_schema_rdm(json_record):
# Get vocabularies used in InvenioRDM
vocabularies = get_vocabularies()

vocabularies = get_vocabularies()
validate_metadata(json_record)
peopleroles = vocabularies["crr"]
resourcetypes = vocabularies["rsrct"]
descriptiontypes = vocabularies["dty"]
Expand Down Expand Up @@ -386,6 +387,169 @@ def customize_schema_rdm(json_record):
return final


def validate_metadata(json_record):
    """
    Validates the presence and structure of required fields in a CaltechDATA JSON record.

    Collects every problem found into a single list so the caller sees all
    validation failures at once, rather than one per call.

    Args:
        json_record (dict): DataCite-style metadata record to validate.

    Raises:
        ValueError: If any required field is missing or structured
            incorrectly; the message lists every collected error.
    """
    errors = []

    # Check for 'types' and 'resourceTypeGeneral'
    if "types" not in json_record:
        errors.append("'types' field is missing.")
    elif not isinstance(json_record["types"], dict):
        errors.append("'types' field should be a dictionary.")
    elif "resourceTypeGeneral" not in json_record["types"]:
        errors.append("'resourceTypeGeneral' field is missing in 'types'.")

    # Check for 'title'
    if "titles" not in json_record:
        errors.append("'titles' field is missing.")
    elif not isinstance(json_record["titles"], list) or len(json_record["titles"]) == 0:
        errors.append("'titles' should be a non-empty list.")
    else:
        # Ensure each title is a dictionary with 'title' field
        for title in json_record["titles"]:
            if not isinstance(title, dict) or "title" not in title:
                errors.append(
                    "Each entry in 'titles' must be a dictionary with a 'title' key."
                )

    # Check for 'publication_date' — either field satisfies the requirement
    if "publicationYear" not in json_record and "dates" not in json_record:
        errors.append(
            "A publication date is required ('publicationYear' or 'dates' field is missing)."
        )
    if "dates" in json_record:
        if not isinstance(json_record["dates"], list):
            errors.append("'dates' should be a list.")
        else:
            for date_entry in json_record["dates"]:
                if (
                    not isinstance(date_entry, dict)
                    or "dateType" not in date_entry
                    or "date" not in date_entry
                ):
                    errors.append(
                        "Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys."
                    )

    # Check for 'creators'
    if "creators" not in json_record:
        errors.append("'creators' field is missing.")
    elif (
        not isinstance(json_record["creators"], list)
        or len(json_record["creators"]) == 0
    ):
        errors.append("'creators' should be a non-empty list.")
    else:
        for creator in json_record["creators"]:
            if not isinstance(creator, dict) or "name" not in creator:
                errors.append(
                    "Each creator in 'creators' must be a dictionary with a 'name' key."
                )

    # Check for 'contributors' (optional section)
    if "contributors" in json_record:
        if not isinstance(json_record["contributors"], list):
            errors.append("'contributors' should be a list.")
        else:
            for contributor in json_record["contributors"]:
                if not isinstance(contributor, dict) or "name" not in contributor:
                    errors.append(
                        "Each contributor must be a dictionary with a 'name' key."
                    )

    # Check for 'resourceType'.
    # Only index into 'types' when it is actually a dict; previously this
    # accessed json_record["types"] unconditionally and raised KeyError
    # (or TypeError) when 'types' was missing or malformed, instead of
    # reporting the already-collected validation errors.
    if isinstance(json_record.get("types"), dict):
        if "resourceType" not in json_record["types"]:
            errors.append("'resourceType' field is missing in 'types'.")
        elif not isinstance(json_record["types"]["resourceType"], str):
            errors.append("'resourceType' should be a string.")

    # Check for 'identifiers' (optional section)
    if "identifiers" in json_record:
        if not isinstance(json_record["identifiers"], list):
            errors.append("'identifiers' should be a list.")
        else:
            for identifier in json_record["identifiers"]:
                if (
                    not isinstance(identifier, dict)
                    or "identifier" not in identifier
                    or "identifierType" not in identifier
                ):
                    errors.append(
                        "Each identifier must be a dictionary with 'identifier' and 'identifierType' keys."
                    )

    # Check for 'subjects' (optional section)
    if "subjects" in json_record:
        if not isinstance(json_record["subjects"], list):
            errors.append("'subjects' should be a list.")
        else:
            for subject in json_record["subjects"]:
                if not isinstance(subject, dict) or "subject" not in subject:
                    errors.append(
                        "Each subject must be a dictionary with a 'subject' key."
                    )

    # Check for 'relatedIdentifiers' (optional section)
    if "relatedIdentifiers" in json_record:
        if not isinstance(json_record["relatedIdentifiers"], list):
            errors.append("'relatedIdentifiers' should be a list.")
        else:
            for related_id in json_record["relatedIdentifiers"]:
                if (
                    not isinstance(related_id, dict)
                    or "relatedIdentifier" not in related_id
                ):
                    errors.append(
                        "Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key."
                    )

    # Check for 'rightsList' (optional section)
    if "rightsList" in json_record:
        if not isinstance(json_record["rightsList"], list):
            errors.append("'rightsList' should be a list.")
        else:
            for rights in json_record["rightsList"]:
                if not isinstance(rights, dict) or "rights" not in rights:
                    errors.append(
                        "Each entry in 'rightsList' must be a dictionary with a 'rights' key."
                    )

    # Check for 'geoLocations' (optional section); each entry needs at
    # least one recognized location form.
    if "geoLocations" in json_record:
        if not isinstance(json_record["geoLocations"], list):
            errors.append("'geoLocations' should be a list.")
        else:
            for location in json_record["geoLocations"]:
                if not isinstance(location, dict):
                    errors.append("Each entry in 'geoLocations' must be a dictionary.")
                elif (
                    "geoLocationPoint" not in location
                    and "geoLocationBox" not in location
                    and "geoLocationPlace" not in location
                ):
                    errors.append(
                        "Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'."
                    )

    # Check for 'fundingReferences' (optional section)
    if "fundingReferences" in json_record:
        if not isinstance(json_record["fundingReferences"], list):
            errors.append("'fundingReferences' should be a list.")
        else:
            for funding in json_record["fundingReferences"]:
                if not isinstance(funding, dict):
                    errors.append("Each funding reference must be a dictionary.")
                if "funderName" not in funding:
                    errors.append("Each funding reference must contain 'funderName'.")

    # Surface everything found in a single exception
    if errors:
        raise ValueError(f"Validation errors in metadata: {', '.join(errors)}")


if __name__ == "__main__":
# Read in from file for demo purposes

Expand Down