From 6c5d8b460cfe5719a3e8331136e1aaf4c317841a Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 25 Jun 2024 12:57:48 -0400 Subject: [PATCH 1/7] initial object dataCheck in islandora --- README.md | 3 + islandoraObjCheck.py | 159 +++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 5 ++ 3 files changed, 167 insertions(+) create mode 100644 islandoraObjCheck.py create mode 100644 requirements.txt diff --git a/README.md b/README.md index 1e36e75..e67b469 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,6 @@ A script to examine a download of a Process (Ingest Card) from the Preserica Mon This command takes a single argument of the path to the downloaded file. The script will add (or replace) the element of `islandora:preservicaRef` to the RELS-EXT with the value of the Preservica Ref identifier. + +## islandoraObjectCheck.py +The file is to intake a pidlist file and search through islandora objects via solr admin, and process the response from search tooutput a csv file containing the pid from intake and the child page items assoicated with. diff --git a/islandoraObjCheck.py b/islandoraObjCheck.py new file mode 100644 index 0000000..e4b8c60 --- /dev/null +++ b/islandoraObjCheck.py @@ -0,0 +1,159 @@ +##Function: pageCount_of_Pid +## readin a pidlist file containing pitt identifiers and process through +## islandora object api request to compute the number of the pids' child +## objects with the filter 'RELS_EXT_isPageOf_uri_s' on objects' metadata +## Created 06/17/2024 +## @author: Ruiling Z. 
+## @params: file-pids.csv, test-solr-object-3.xml +## @result: mscount.csv + +import requests, json, os +import re +import csv +from collections import defaultdict + +#locate xml data +from xml.etree import ElementTree as etree + +f_path = os.path.dirname(os.path.realpath(__file__)) +''' +1) api to get islandora object via filter of pageOf attributes +2) dump the objectIDs(PID) with the counts of children records associated to the PIDs + +3) preservica report api, similarly get compoundObjectIDs(conceptual objs) counts, and + count the associated children members baseon the concptual objectID +4) validate both parent level and with the kids level + +''' +f_pids = "file-pids.csv" +mscount_file="mscount.csv" +def get_islandoraData(s_query): + #pid format convention + str_q = "PID:pitt\\" + s_query[4:] + try: + #step1). retrieve object from islandora api request + url ='https://gamera.library.pitt.edu/solr/uls_digital_core/select' + payload = {"q": str_q, + "fl":"PID,RELS_EXT_isPageOf_uri_s,RELS_EXT_hasModel_uri_ms", + "sort":"PID asc", + "wt":"json"} + + responses = requests.get(url, params=payload) + if (responses.status_code ==200) : + # store json data rest api response + json_data = (responses.json()) + results = json_data['response'] + #print (json.dumps(results['docs'], indent=4)) + return (results) + except requests.exceptions.HTTPError as e: + print("Error: " + str(e)) +#define a dict value with a value of list holding ms_count, and ms_items +ms_items = defaultdict(list) + +# step2) Helper function used to compute the multpart objects with the relation mapping of RELS_EXT_isPageOf_uri_s +# to the pid from the solr api response +def get_multipart_count(objID): + results = get_islandoraData(objID) + assert isinstance(results, dict) #make sure the response data is a dict + for data in results['docs']: + tmpPagelst ={} + tempParentID ="" + #pass the objID to solr to retrieve data from islandora + if ((data["PID"]) == objID and "RELS_EXT_isPageOf_uri_s" in data): 
+ + # extract the parentPID from the pageOf + tempParentID = data["RELS_EXT_isPageOf_uri_s"].rpartition("/")[2] + #print("Check2).", objID, " ", data["RELS_EXT_isPageOf_uri_s"], "parentObj: " ,tempParentID) + + #first time associate page item to parent + if not (tempParentID in ms_items.keys()): + tmpPagelst['pageIds']=[] + tmpPagelst['pageIds'].append(objID) + tmpPagelst['counter'] = 1 + ms_items[tempParentID]=tmpPagelst + print("New key added in ms_items: ", ms_items[tempParentID]) + + else: + #update the value for the key matching tempParentID + v= [v for k,v in ms_items.items() if k == tempParentID] + v[0]["counter"] += 1 + v[0]["pageIds"].append(objID) + #print("after update: ", ms_items[tempParentID]) + return ms_items + +#Main Function: takes in csv file containing object IDs, and iterate each ID to check on islandora via solr search +#it outputs a csv file with the number of pageOf items associated with each PID +def pageCount_of_Pid (inFile_pids): + #open file to read the pids + with open (os.path.join(f_path, inFile_pids), 'r') as pid_f: + pidreader = csv.DictReader(pid_f) + + #step3). 
write output file + with open(os.path.join(f_path, mscount_file), 'w', newline='') as match_f: + header_lst = ['PID', 'num_isPageOf_uri_s', 'pageitems'] + f_writer = csv.DictWriter(match_f, fieldnames=header_lst) + f_writer.writeheader() + #now interate each objs from response + for row in pidreader: + item = row['pitt-pid'] + #print("Check1) passing pid: ", item) + mydict = get_multipart_count(item) + if mydict: + print("Check: final ms_item before writing in file " ,json.dumps(mydict, indent =4)) + for k,v in mydict.items(): + f_writer.writerow({header_lst[0]:k, header_lst[1]:v['counter'],header_lst[2]:v['pageIds']}) + +pageCount_of_Pid(f_pids) + +""" +##testing draft for the version parsing xml: readin a pidlist file containing pitt identifiers and process through +## the object api request response from islandora to compute the number of the pids' child objects with +## the filter 'RELS_EXT_isPageOf_uri_s' +## Created 06/13/2024 +## @params: file-pids.csv, test-solr-object-3.xml +## @result: mscount.csv +""" +xmlfile ="test-solr-object-3.xml" +tmptree=etree.parse(os.path.join(f_path, xmlfile)) +#print(tmptree.getroot().tag) #response +rootnode =tmptree.getroot() +#print(etree.tostring(rootnode, encoding='utf8').decode('utf8')) + + +#step2). read a pids input file +#iterate name to locate pid +pid_lst =['pitt:31735070061167','pitt:31735029251976'] + +def checkObjMs(id_lst): + ms_dict =dict(); + #open file to read the pids + with open(os.path.join(f_path, f_pids), 'r') as pid_f: + pidreader = csv.DictReader(pid_f) + + #step3). 
write output file + with open(os.path.join(f_path, mscount_file), 'w', newline='') as match_f: + header_lst = ['PID', 'num_isPageOf_uri_s'] + f_writer = csv.DictWriter(match_f, fieldnames=header_lst) + f_writer.writeheader() + + #now loop the result to interate each records matching the creteria + for row in pidreader: + item = row['pitt-pid'] + # find the match pid under + counter =0 + for em in tmptree.findall(".//*[@name='PID']"): + if (em.text == item): + print("element: ", em.attrib, "value: ", em.text) + for ms in tmptree.findall(".//*[@name='RELS_EXT_isPageOf_uri_s']"): + #if ms.text.find(em.text) != -1: //find match + found = re.search(r'[a-zA-Z]*(pitt)\:{}$'.format(em.text.split(':')[-1]), ms.text) + if(found): + counter +=1 #have child elements, then find the pid + if counter >0: + print("ms element: " + em.text + " child element: " + str(counter) + "\n" ) + else: + print("No child element found in ms element: " + em.text +"\n" ) + f_writer.writerow({header_lst[0]:em.text.split(":")[-1], header_lst[1]:counter}) +#testcall -PASS +#checkObjMs(pid_lst) + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..fd8654b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +certifi==2024.6.2 +charset-normalizer==3.3.2 +idna==3.7 +requests==2.32.3 +urllib3==2.2.1 From 83a949f9a4ba356136b8eadf805b188eba2fb13f Mon Sep 17 00:00:00 2001 From: Zhang Date: Tue, 9 Jul 2024 18:15:40 -0400 Subject: [PATCH 2/7] compare the pageOf counts of islandora objects to the bitstreams count of the corresponding preservica object --- .gitignore | 7 ++ README.md | 12 ++- input/file-pids-sample.csv | 11 +++ islandoraObjCheck.py | 176 +++++++++++---------------------- preservicaCheck.py | 28 ++++++ preservicaObjCapture.py | 196 +++++++++++++++++++++++++++++++++++++ rdfUpdate.py | 112 +++++++++++++++++++++ 7 files changed, 423 insertions(+), 119 deletions(-) create mode 100644 .gitignore create mode 100644 input/file-pids-sample.csv create mode 100644 
preservicaCheck.py create mode 100644 preservicaObjCapture.py create mode 100644 rdfUpdate.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1cd5401 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.log +*.txt +*.xml + +#sample testing file +#input/ +#output/ diff --git a/README.md b/README.md index e67b469..b3336d6 100644 --- a/README.md +++ b/README.md @@ -19,4 +19,14 @@ This command takes a single argument of the path to the downloaded file. The script will add (or replace) the element of `islandora:preservicaRef` to the RELS-EXT with the value of the Preservica Ref identifier. ## islandoraObjectCheck.py -The file is to intake a pidlist file and search through islandora objects via solr admin, and process the response from search tooutput a csv file containing the pid from intake and the child page items assoicated with. +The file is to intake a pidlist file and search through islandora objects via solr admin, then process the response and compute the total child pageitems of the object, as well as the preservia reference ID associated to the object + +## preservicaCheck.py +The file is to generate preservica token to access the preservica restful apis + +## preservicaObjCapture.py +The file is to intake islandora's objects' pageOf member counts and compare the bitstreams count from the corresponding preservica objects. The script also execute a drush command to export the rdf for the countMatched objects. + +## rdfUpdate.py +The script is to iterate all the updated rdf files and use drush push back to islandora. 
+ diff --git a/input/file-pids-sample.csv b/input/file-pids-sample.csv new file mode 100644 index 0000000..3a062f9 --- /dev/null +++ b/input/file-pids-sample.csv @@ -0,0 +1,11 @@ +pitt:31735073061008 +pitt:31735073060950 +pitt:31735073060893 +pitt:31735073060927 +pitt:2000.07.008 +pitt:2000.07.014 +pitt:2000.07.009 +pitt:2000.07.010 +pitt:2000.07.011 +pitt:2000.07.063 +pitt:1935e49702 diff --git a/islandoraObjCheck.py b/islandoraObjCheck.py index e4b8c60..9ccbeac 100644 --- a/islandoraObjCheck.py +++ b/islandoraObjCheck.py @@ -2,158 +2,98 @@ ## readin a pidlist file containing pitt identifiers and process through ## islandora object api request to compute the number of the pids' child ## objects with the filter 'RELS_EXT_isPageOf_uri_s' on objects' metadata -## Created 06/17/2024 -## @author: Ruiling Z. -## @params: file-pids.csv, test-solr-object-3.xml -## @result: mscount.csv +## @params: file_pids +## @result: file_PgCount import requests, json, os -import re import csv from collections import defaultdict -#locate xml data -from xml.etree import ElementTree as etree - f_path = os.path.dirname(os.path.realpath(__file__)) -''' -1) api to get islandora object via filter of pageOf attributes -2) dump the objectIDs(PID) with the counts of children records associated to the PIDs - -3) preservica report api, similarly get compoundObjectIDs(conceptual objs) counts, and - count the associated children members baseon the concptual objectID -4) validate both parent level and with the kids level +file_pids = "./input/file-pids.csv" #could get from drush +file_pgCount="./output/membercount.csv" -''' -f_pids = "file-pids.csv" -mscount_file="mscount.csv" +#retrieve Object and its pageOf members from islandora def get_islandoraData(s_query): #pid format convention - str_q = "PID:pitt\\" + s_query[4:] + q_par = "PID:pitt\\" + s_query[4:] + q_pages = "RELS_EXT_isPageOf_uri_s: info\\:fedora\\/pitt\\" + s_query[4:] + " OR " + q_par try: #step1). 
retrieve object from islandora api request url ='https://gamera.library.pitt.edu/solr/uls_digital_core/select' - payload = {"q": str_q, - "fl":"PID,RELS_EXT_isPageOf_uri_s,RELS_EXT_hasModel_uri_ms", - "sort":"PID asc", + payload = {"q": q_pages, + "fl":"PID,RELS_EXT_isPageOf_uri_s,RELS_EXT_hasModel_uri_ms,RELS_EXT_preservicaRef_literal_s", + "sort":"PID asc", + "rows":"100000", "wt":"json"} responses = requests.get(url, params=payload) if (responses.status_code ==200) : - # store json data rest api response json_data = (responses.json()) results = json_data['response'] - #print (json.dumps(results['docs'], indent=4)) + #print(json.dumps(ms_items, indent=4)) return (results) except requests.exceptions.HTTPError as e: print("Error: " + str(e)) -#define a dict value with a value of list holding ms_count, and ms_items + +#define a dict value with a value of list holding islandora object and its pageOf count ms_items = defaultdict(list) -# step2) Helper function used to compute the multpart objects with the relation mapping of RELS_EXT_isPageOf_uri_s -# to the pid from the solr api response +# Helper function to compute the multpart objects via the relation mapping +# 'RELS_EXT_isPageOf_uri_s' to the Object PID from solr api response def get_multipart_count(objID): results = get_islandoraData(objID) - assert isinstance(results, dict) #make sure the response data is a dict + numOfpages = results['numFound'] + s_preservicaRef ="" + #make sure the response data is a dict + assert isinstance(results, dict) + for data in results['docs']: - tmpPagelst ={} - tempParentID ="" - #pass the objID to solr to retrieve data from islandora - if ((data["PID"]) == objID and "RELS_EXT_isPageOf_uri_s" in data): - - # extract the parentPID from the pageOf - tempParentID = data["RELS_EXT_isPageOf_uri_s"].rpartition("/")[2] - #print("Check2).", objID, " ", data["RELS_EXT_isPageOf_uri_s"], "parentObj: " ,tempParentID) - - #first time associate page item to parent - if not (tempParentID in 
ms_items.keys()): - tmpPagelst['pageIds']=[] - tmpPagelst['pageIds'].append(objID) + tmpPagelst = defaultdict(list) + #capture the preservica reference ID associated to the ObjectID, if existing + if ( "RELS_EXT_preservicaRef_literal_s" in data): + s_preservicaRef = data["RELS_EXT_preservicaRef_literal_s"] + numOfpages -=1 #exclude parent Object + + #pass objID to solr to retrieve childcontent from islandora + if ("RELS_EXT_isPageOf_uri_s" in data): + #retrieve parent object associated + uri_obj = data["RELS_EXT_isPageOf_uri_s"].split("/")[-1] + if not ( uri_obj in ms_items.keys()): tmpPagelst['counter'] = 1 - ms_items[tempParentID]=tmpPagelst - print("New key added in ms_items: ", ms_items[tempParentID]) - + ms_items[uri_obj]=tmpPagelst else: - #update the value for the key matching tempParentID - v= [v for k,v in ms_items.items() if k == tempParentID] + #update the value for the key matching object ID + v= [v for k,v in ms_items.items() if k == uri_obj] v[0]["counter"] += 1 - v[0]["pageIds"].append(objID) - #print("after update: ", ms_items[tempParentID]) + + #export the associated preservica reference ID if existing + if (s_preservicaRef): + val = [val for keyId, val in ms_items.items() if keyId==objID] + val[0]['preservica_RefID'] = s_preservicaRef + return ms_items -#Main Function: takes in csv file containing object IDs, and iterate each ID to check on islandora via solr search -#it outputs a csv file with the number of pageOf items associated with each PID +# Main Function: takes in PIDfile in the format {PID}. It iterates pids to check on islandora via +# solr search, and outputs a csv file containing total# of the Object's pageOf items from islandora, and +# preservica referenceID associated to the pid, if exising def pageCount_of_Pid (inFile_pids): - #open file to read the pids with open (os.path.join(f_path, inFile_pids), 'r') as pid_f: - pidreader = csv.DictReader(pid_f) - + pidreader = csv.reader(pid_f) + #step3). 
write output file - with open(os.path.join(f_path, mscount_file), 'w', newline='') as match_f: - header_lst = ['PID', 'num_isPageOf_uri_s', 'pageitems'] - f_writer = csv.DictWriter(match_f, fieldnames=header_lst) - f_writer.writeheader() - #now interate each objs from response + with open(os.path.join(f_path, file_pgCount), 'w', newline='') as match_f: + header_lst = ['PID', 'num_isPageOf_uri_s', 'preservica_refID'] + f_writer = csv.writer(match_f, delimiter=',') + f_writer.writerow(header_lst) + #now iterate each objs from response for row in pidreader: - item = row['pitt-pid'] - #print("Check1) passing pid: ", item) - mydict = get_multipart_count(item) + mydict = get_multipart_count(row[0]) + if mydict: - print("Check: final ms_item before writing in file " ,json.dumps(mydict, indent =4)) for k,v in mydict.items(): - f_writer.writerow({header_lst[0]:k, header_lst[1]:v['counter'],header_lst[2]:v['pageIds']}) - -pageCount_of_Pid(f_pids) - -""" -##testing draft for the version parsing xml: readin a pidlist file containing pitt identifiers and process through -## the object api request response from islandora to compute the number of the pids' child objects with -## the filter 'RELS_EXT_isPageOf_uri_s' -## Created 06/13/2024 -## @params: file-pids.csv, test-solr-object-3.xml -## @result: mscount.csv -""" -xmlfile ="test-solr-object-3.xml" -tmptree=etree.parse(os.path.join(f_path, xmlfile)) -#print(tmptree.getroot().tag) #response -rootnode =tmptree.getroot() -#print(etree.tostring(rootnode, encoding='utf8').decode('utf8')) - - -#step2). read a pids input file -#iterate name to locate pid -pid_lst =['pitt:31735070061167','pitt:31735029251976'] - -def checkObjMs(id_lst): - ms_dict =dict(); - #open file to read the pids - with open(os.path.join(f_path, f_pids), 'r') as pid_f: - pidreader = csv.DictReader(pid_f) - - #step3). 
write output file - with open(os.path.join(f_path, mscount_file), 'w', newline='') as match_f: - header_lst = ['PID', 'num_isPageOf_uri_s'] - f_writer = csv.DictWriter(match_f, fieldnames=header_lst) - f_writer.writeheader() - - #now loop the result to interate each records matching the creteria - for row in pidreader: - item = row['pitt-pid'] - # find the match pid under - counter =0 - for em in tmptree.findall(".//*[@name='PID']"): - if (em.text == item): - print("element: ", em.attrib, "value: ", em.text) - for ms in tmptree.findall(".//*[@name='RELS_EXT_isPageOf_uri_s']"): - #if ms.text.find(em.text) != -1: //find match - found = re.search(r'[a-zA-Z]*(pitt)\:{}$'.format(em.text.split(':')[-1]), ms.text) - if(found): - counter +=1 #have child elements, then find the pid - if counter >0: - print("ms element: " + em.text + " child element: " + str(counter) + "\n" ) - else: - print("No child element found in ms element: " + em.text +"\n" ) - f_writer.writerow({header_lst[0]:em.text.split(":")[-1], header_lst[1]:counter}) -#testcall -PASS -#checkObjMs(pid_lst) - + f_writer.writerow([k, v['counter'], v['preservica_RefID']]) + +#Driver +if __name__ == "__main__": + pageCount_of_Pid(file_pids) \ No newline at end of file diff --git a/preservicaCheck.py b/preservicaCheck.py new file mode 100644 index 0000000..45a0026 --- /dev/null +++ b/preservicaCheck.py @@ -0,0 +1,28 @@ +import requests +import time + +#generate token to access preservica restful api +sUrl="https://pitt.preservica.com/api/accesstoken/login" +testuser ="testuser@pitt.edu" +testpw="testpassword" +data ={"username": testuser, + "password": testpw + } +headers = {"Contnent-Type": "application/x-www-form-urlencoded"} +def generateToken(): + r = requests.post (sUrl, data=data, headers=headers) + if r.status_code != 200: + print("Error:" , r.status_code) + exit(1) + else: + return [r.json()['token'], r.json()['refresh-token']] + + +def getRefreshToken(s): + sRefreshUrl 
="https://pitt.preservica.com/api/accesstoken/refresh?refreshToken=" + s[1] + newheaders = {"Preservica-Access-Token" : s[0], + "Contnent-Type": "application/x-www-form-urlencoded"} + + res = requests.post (sRefreshUrl, headers=newheaders) + if res.status_code == 200: + return [res.json()['token'], res.json()['refresh-token']] diff --git a/preservicaObjCapture.py b/preservicaObjCapture.py new file mode 100644 index 0000000..46ff308 --- /dev/null +++ b/preservicaObjCapture.py @@ -0,0 +1,196 @@ +########################################################### +##Function: preservicaObjCapture.py +## process an inputFile containing Islandora Objects'pid,numberOfPage of the Object to valid +## the associated preservica objects by checking the count of the bitstreams for each preservica +## object matching the numberOfPage count from the correspondent islandora object +## @params: islandorapids.csv : the output file from islandoraObjCheck.py +## @result: valid_result.csv +########################################################### +import requests,json,csv +from xml.etree import ElementTree as etree +from collections import defaultdict +import sys, os, getopt, array +import subprocess +import time + +import preservicaCheck as token_fn +f_path = os.path.dirname(os.path.realpath(__file__)) +islandora_count_f = "./output/membercount.csv" +valid_result_f = "./output/valid_result.csv" + +tmpToken ="c5b43831-9c2f-4c96-903d-be53b86be835" +headers = { + 'Preservica-Access-Token': tmpToken +} + +curr_session=[] + +sInfoObj_baseUrl = "https://pitt.preservica.com/api/entity/information-objects/" +sContentObj_baseUrl = "https://pitt.preservica.com/api/entity/content-objects/" + + +InfoObjdata = defaultdict(list) +def getObjInfo(apiUrl): # -------------PASS--------------# + InfoObjdata = defaultdict(list) + try: + responses = requests.get(apiUrl, headers=headers) + responses.raise_for_status() + xml_response = str(responses.content.decode('UTF-8')) + + #process the xml + entity_response 
= etree.fromstring(xml_response) + reference = entity_response.find('.//{http://preservica.com/XIP/v7.2}Ref') + identifier = entity_response.find('.//{http://preservica.com/EntityAPI/v7.2}Identifiers') + representation = entity_response.find('.//{http://preservica.com/EntityAPI/v7.2}Representations') + tmpObjInfo = {} + tmpObjInfo["PIDInfo"] = identifier.text + tmpObjInfo["representationInfo"] = representation.text + InfoObjdata[reference.text] = tmpObjInfo + + return InfoObjdata + except requests.exceptions.RequestException as e: + print("Error: ", e) + +#capture all ContentObjects from Representations : dict {objectId: contentids} +def getContentObjID(sRep_Url): + r = requests.get(sRep_Url, headers=headers) + counter =0 + contentobjdata ={} + # store json data rest api + if (r.status_code == 200): + xml_resRepresentation = str(r.content.decode('UTF-8')) + #process the xml + res_content_response = etree.fromstring(xml_resRepresentation) + infoObjID = res_content_response.find('.//{http://preservica.com/XIP/v7.2}InformationObject') + contentobjs = res_content_response.findall('.//{http://preservica.com/XIP/v7.2}ContentObjects/{http://preservica.com/XIP/v7.2}ContentObject') + tempcontentid =[] + + for contentobj in contentobjs: + #capture all content object ids for the information object + tempcontentid.append(contentobj.text) + contentobjdata[infoObjID.text] = tempcontentid + return contentobjdata + +#make a generic call to retrieve object data from preservica restapi +def getcontenobjInfo(sObjbaseUrl, sobjitem="", sParam=""): + try: + r = (sobjitem and sParam) and requests.get(f'{sObjbaseUrl}{sParam}/{sobjitem}', headers=headers) or requests.get(f'{sObjbaseUrl}', headers=headers) + r.raise_for_status() + xml_responses = str(r.content.decode('UTF-8')) + res_tree_response = etree.fromstring(xml_responses) + return res_tree_response + except requests.exceptions.RequestException as e: + print("Error: ", e) + +#capture the generations of contentobject id +def 
getbitstreamInfo(sContentGen, sRefId): + res_gen = getcontenobjInfo(sContentObj_baseUrl, sContentGen , sRefId) + genLst = res_gen.findall('.//{http://preservica.com/EntityAPI/v7.2}Generations/{http://preservica.com/EntityAPI/v7.2}Generation[@active ="true"]') + + contBitstream = defaultdict(list) + total = 0 + #iterate generation to get the bitstream count + for ele in genLst: + bitstreamLst = getcontenobjInfo(ele.text).findall('.//{http://preservica.com/XIP/v7.2}Bitstreams/{http://preservica.com/XIP/v7.2}Bitstream') + total += len(bitstreamLst) + contBitstream[sRefId] = total + return contBitstream + +#retrieves all representations of InformationObj and compute total bitstreams underneath contentObjs for the representation +def getRepresentionInfo(sUrl, sRef_ID): + stempUrl = sUrl + sRef_ID + testInfoObj = getObjInfo(stempUrl) + if len(testInfoObj) > 0: + try: + sRepUrl = next(iter(testInfoObj.values()))['representationInfo'] + req_Rep = requests.get(sRepUrl, headers=headers) + req_Rep.raise_for_status() + xml_reqRep = str(req_Rep.content.decode('UTF-8')) + representation_rep = etree.ElementTree(etree.fromstring(xml_reqRep)) + representations = representation_rep.findall( + './/{http://preservica.com/EntityAPI/v7.2}Representations/{http://preservica.com/EntityAPI/v7.2}Representation' + ) + contentObj_lst ={} + BitstreamCount =0 + for representation in representations: + #print(representation.text, " ", representation.attrib) + + #get all content object IDs for each of the represention + listOfRepresentContent = getContentObjID(representation.text) + + if len(listOfRepresentContent) > 0 : #get value of the first key since only one element + #iterate each contentObj to get its generations for computing bitstreams + for co in listOfRepresentContent.values(): + if len(co) > 0: + for i in co: + BitstreamCount += getbitstreamInfo("generations", i)[i] + #print("Total bitstream for :" ,next(iter(listOfRepresentContent)), "is ", BitstreamCount) + + if not 
(next(iter(listOfRepresentContent)) in contentObj_lst.keys()): + contentObj_lst[next(iter(listOfRepresentContent))] = BitstreamCount + else: + contentObj_lst[next(iter(listOfRepresentContent))] += BitstreamCount + #print(contentObj_lst) #this should be the total bitstreams under the InformationObject + return (contentObj_lst) + except requests.exceptions.RequestException as e: + print("Error: ", e) + +#match the count, if match update the xml, otherwise output log error +def preservica_bitstream_valid (f_in): + #open file to read the output file from islandoraObjcheck eg. mscount.csv) + with open (os.path.join(f_path, f_in), 'r', newline='') as islandoraCount_f: + csvreader = csv.DictReader(islandoraCount_f) + + with open(os.path.join(f_path, valid_result_f), 'w', newline='') as result_f: + header_lst = ['PID', 'islandora_count', 'preservica_refID', 'bitstreamCount'] + f_writer = csv.DictWriter(result_f, fieldnames=header_lst) + f_writer.writeheader() + #now iterate each objs from response + start_time =time.time() + for row in csvreader: + bitstream_dict ={} + bitstream_dict = getRepresentionInfo(sInfoObj_baseUrl, row['preservica_refID']) + if bitstream_dict: + for k,v in bitstream_dict.items(): + if v == int(row['num_isPageOf_uri_s']): + f_writer.writerow({header_lst[0]:row['PID'], header_lst[1]:row['num_isPageOf_uri_s'], + header_lst[2]:k, header_lst[3]:v}) + else: + f_writer.writerow({header_lst[0]:row['PID'], header_lst[1]:row['num_isPageOf_uri_s'], + header_lst[2]:k, header_lst[3]:""}) + + print(f"usage: {(time.time()-start_time)*10**3:.03f} ms") + print(curr_session) + #need regenerate a refresh token + if ((time.time()-start_time) - 600000 >0 ): + new_session = token_fn.getRefreshToken(curr_session) + curr_session[0]= new_session[0] + curr_session[1] =new_session[1] + headers['Preservica-Access-Token']= new_session[0] + print(headers) + + +def drushfetchPids(): + file_name = curr = os.getcwd() +"/output/drush_pid.csv" + user = os.environ['USER'] if 
os.getenv("USER") is not None else os.environ['USERNAME'] + squery = 'RELS_EXT_preservicaRef_literal_s:* ' + squery += 'AND (RELS_EXT_hasModel_uri_ms:info:fedora/islandora:manuscriptCModel OR RELS_EXT_hasModel_uri_ms:info:fedora/islandora:newspaperIssueCModel OR RELS_EXT_hasModel_uri_ms:info:fedora/islandora:bookCModel) ' + squery += 'AND NOT RELS_EXT_preservicaChildCount_literal_s:*' + try: + s = subprocess.check_call (['drush', '--root=/var/www/html/drupal7/', '--user={}'.format(user), \ + '--uri=http://gamera.library.pitt.edu', 'islandora_datastream_crud_fetch_pids', \ + '--solr_query={}'.format(squery), '--pid_file={}'.format(file_name)]) + + except subprocess.CalledProcessError as e: + print(f"Command failed with return code {e.returncode}") + +if __name__ == "__main__": + #drushfetchPids() + curr_session = token_fn.generateToken() + print("token :" ,curr_session[0], " refresh-token: ", curr_session[1]) + headers['Preservica-Access-Token'] = curr_session[0] + + preservica_bitstream_valid(islandora_count_f) + + + diff --git a/rdfUpdate.py b/rdfUpdate.py new file mode 100644 index 0000000..1334020 --- /dev/null +++ b/rdfUpdate.py @@ -0,0 +1,112 @@ +########################################################## +## The file processes an inputFile containing Object PID with its membercontent count from islandora +## and the bitstreams count for the same object from preservica. It then iterates PIDs with the matched +## membercount and update the corresponding rdf file by adding the count +## value to a new field 'preservicaChildCount'. 
The last step is to +## drush push the updated rdf files to islandora +## @params: valid_result_f +## @result: update_extfiles/.rdf files +########################################################### +import sys, os, csv, shutil +from xml.etree import ElementTree as etree +from xml.etree.ElementTree import Element, SubElement +import subprocess,fnmatch +from pprint import pprint + +f_path = os.path.dirname(os.path.realpath(__file__)) +curr = os.getcwd() +user = os.environ['USER'] if os.getenv("USER") is not None else os.environ['USERNAME'] + +valid_result_f = "./output/valid_result.csv" +f_output = "./output/temp_pid.csv" +extRel_fpath = "/output/extfiles" +update_fpath = "/output/update_extfiles" + +#1. create a temp pidsFile holding the verified the bitstreams count for the corresponding perservica obj +## result: f_output/temp_pid.csv +def getVerifiedPids(f_pids): + header_fields=["PID"] + with open (os.path.join( f_path, f_pids ), 'r', newline='') as pids_f: + csvreader = csv.DictReader(pids_f) + with open(os.path.join(f_path, f_output), 'w', newline='') as temp_f: + csvwriter = csv.DictWriter(temp_f, fieldnames=header_fields) + for r in csvreader: + if (r["bitstreamCount"]): # verified the match + csvwriter.writerow({header_fields[0]:r["PID"]}) + +#2.Iterate the intake pids and extract the ext-rel file via drush from islandora +# and export the original rdf files to the designed output directory +def drushfetchDatastream(): + pidtest_name = os.path.join(f_path, f_output) + try: + subprocess.check_call (['drush', '--root=/var/www/html/drupal7/', '--user={}'.format(user), \ + '--uri=http://gamera.library.pitt.edu', 'islandora_datastream_crud_fetch_datastreams', '--dsid=RELS-EXT', \ + '--pid_file={}'.format(pidtest_name), '--datastreams_directory={}{}'.format(curr, extRel_fpath), '--filename_separator=^', '-y']) + + except subprocess.CalledProcessError as e: + print(f"Command failed with return code {e.returncode}") + +#3. 
helper function to update the xml +def fileProcess(fname, ele_name, ele_val): + #updatefiles in updated_file dir + curr_file = curr + extRel_fpath + "/" + fname + + #register ns to reserve the original prefix + ns_dict =dict([node for _,node in etree.iterparse(curr_file, events=['start-ns'])]) + #pprint(ns_dict) + etree.register_namespace('', 'http://digital.library.pitt.edu/ontology/relations#') + etree.register_namespace('fedora', 'info:fedora/fedora-system:def/relations-external#') + etree.register_namespace('fedora-model', 'info:fedora/fedora-system:def/model#') + etree.register_namespace('islandora', 'http://islandora.ca/ontology/relsext#') + etree.register_namespace('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#') + + curr_tree = etree.parse(curr_file) + curr_root =curr_tree.getroot() + #add new tag "preservicaChildCount" + desc_element = curr_root.find('rdf:Description', ns_dict) + newNode =etree.SubElement(desc_element, etree.QName(ns_dict["islandora"], ele_name)) + newNode.text =ele_val + updated_file = curr + update_fpath + "/" + fname + curr_tree.write( updated_file, "UTF-8") + +#4. process an inputFile generated from preservicaObjCapture and add a new tag +# RELS_EXT_preservicaChildCount_literal_s to pid's rdf +# @param: filename: islandora-preservica-bitstream matched pidfile +# @param: filepath: file dir to host the extl-rel generated from fname pids via drush +# @param: elementName: new element tagName designed to be added + +def updateExtRelFile(fpath, fname, e_name): + with open (os.path.join( f_path, fname ), 'r', newline='') as pf: + csvreader = csv.DictReader(pf) + for r in csvreader: + if (r["bitstreamCount"]): # pid with the verified countmatch + #find the ext file matching the name r['PID'], might use re + tmp_pattern = r['PID'] + "^RELS-EXT.rdf" + for file in os.listdir(curr+fpath): + if fnmatch.fnmatch(file, tmp_pattern): + print(file) + fileProcess(file, e_name, r["bitstreamCount"]) + +#5. 
push modified .rdf to islandora via drush +def drushpushDatastreams(): + try: + subprocess.check_call (['drush', '--root=/var/www/html/drupal7/', '--user={}'.format(user), \ + '--uri=http://gamera.library.pitt.edu', 'islandora_datastream_crud_push_datastreams', '--no_derivs', \ + '--update_dc=0', '--datastreams_source_directory={}/output/update_extfiles'.format(curr), '--filename_separator=^', '-y']) + + except subprocess.CalledProcessError as e: + print(f"Command failed with return code {e.returncode}") + +if __name__ == "__main__": + getVerifiedPids(valid_result_f) + drushfetchDatastream() + + ##copy all the file in an update_fpath to use for testing purpose + org_files = os.listdir(curr+extRel_fpath) + shutil.copytree(curr+extRel_fpath, curr+update_fpath) + + newTagName = "preservicaChildCount" + updateExtRelFile(update_fpath, valid_result_f,newTagName) + #drushpushDatastreams() + + From 78466a3330ab15a7d033f559e372bbc6e11b1d40 Mon Sep 17 00:00:00 2001 From: Ruiling Zhang Date: Wed, 10 Jul 2024 18:14:05 -0400 Subject: [PATCH 3/7] add the timestamp to track token expiration --- .gitignore | 5 ++-- islandoraObjCheck.py | 27 +++++++++++++++---- preservicaCheck.py | 1 - preservicaObjCapture.py | 59 +++++++++++++++-------------------------- rdfUpdate.py | 16 +++++------ 5 files changed, 55 insertions(+), 53 deletions(-) diff --git a/.gitignore b/.gitignore index 1cd5401..575d68c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ *.log *.txt *.xml +.env #sample testing file -#input/ -#output/ +input/ +output/ diff --git a/islandoraObjCheck.py b/islandoraObjCheck.py index 9ccbeac..212de10 100644 --- a/islandoraObjCheck.py +++ b/islandoraObjCheck.py @@ -7,11 +7,12 @@ import requests, json, os import csv +import subprocess from collections import defaultdict f_path = os.path.dirname(os.path.realpath(__file__)) -file_pids = "./input/file-pids.csv" #could get from drush -file_pgCount="./output/membercount.csv" +file_pids = "./input/file-pids.csv" #intakes pidfile 
+file_pgCount="./output/membercount.csv" #retrieve Object and its pageOf members from islandora def get_islandoraData(s_query): @@ -70,7 +71,8 @@ def get_multipart_count(objID): #export the associated preservica reference ID if existing if (s_preservicaRef): val = [val for keyId, val in ms_items.items() if keyId==objID] - val[0]['preservica_RefID'] = s_preservicaRef + if val: + val[0]['preservica_RefID'] = s_preservicaRef return ms_items @@ -81,7 +83,7 @@ def pageCount_of_Pid (inFile_pids): with open (os.path.join(f_path, inFile_pids), 'r') as pid_f: pidreader = csv.reader(pid_f) - #step3). write output file + #write output file with open(os.path.join(f_path, file_pgCount), 'w', newline='') as match_f: header_lst = ['PID', 'num_isPageOf_uri_s', 'preservica_refID'] f_writer = csv.writer(match_f, delimiter=',') @@ -94,6 +96,21 @@ def pageCount_of_Pid (inFile_pids): for k,v in mydict.items(): f_writer.writerow([k, v['counter'], v['preservica_RefID']]) -#Driver +def drushfetchPids(): + file_name = os.getcwd() +"/input/file-pids.csv" + user = os.environ['USER'] if os.getenv("USER") is not None else os.environ['USERNAME'] + squery = 'RELS_EXT_preservicaRef_literal_s:* ' + squery += 'AND (RELS_EXT_hasModel_uri_ms:info\:fedora/islandora\:manuscriptCModel OR RELS_EXT_hasModel_uri_ms:info\:fedora/islandora\:newspaperIssueCModel OR RELS_EXT_hasModel_uri_ms:info\:fedora/islandora\:bookCModel)' + squery += 'AND NOT RELS_EXT_preservicaChildCount_literal_s:*' + + try: + s = subprocess.check_call (['drush', '--root=/var/www/html/drupal7/', '--user={}'.format(user), \ + '--uri=http://gamera.library.pitt.edu', 'islandora_datastream_crud_fetch_pids', \ + '--solr_query={}'.format(squery), '--pid_file={}'.format(file_name)]) + + except subprocess.CalledProcessError as e: + print(f"Command failed with return code {e.returncode}") + if __name__ == "__main__": + drushfetchPids() pageCount_of_Pid(file_pids) \ No newline at end of file diff --git a/preservicaCheck.py b/preservicaCheck.py 
index 45a0026..2852ac8 100644 --- a/preservicaCheck.py +++ b/preservicaCheck.py @@ -1,5 +1,4 @@ import requests -import time #generate token to access preservica restful api sUrl="https://pitt.preservica.com/api/accesstoken/login" diff --git a/preservicaObjCapture.py b/preservicaObjCapture.py index 46ff308..62c8dfa 100644 --- a/preservicaObjCapture.py +++ b/preservicaObjCapture.py @@ -3,24 +3,25 @@ ## process an inputFile containing Islandora Objects'pid,numberOfPage of the Object to valid ## the associated preservica objects by checking the count of the bitstreams for each preservica ## object matching the numberOfPage count from the correspondent islandora object -## @params: islandorapids.csv : the output file from islandoraObjCheck.py +## @params: islandorapids.csv : file generated from islandoraObjCheck process ## @result: valid_result.csv ########################################################### -import requests,json,csv +import requests, json, csv from xml.etree import ElementTree as etree from collections import defaultdict import sys, os, getopt, array import subprocess import time - import preservicaCheck as token_fn + f_path = os.path.dirname(os.path.realpath(__file__)) islandora_count_f = "./output/membercount.csv" valid_result_f = "./output/valid_result.csv" -tmpToken ="c5b43831-9c2f-4c96-903d-be53b86be835" +st_timer = time.time() +tmp ="" headers = { - 'Preservica-Access-Token': tmpToken + 'Preservica-Access-Token': tmp } curr_session=[] @@ -28,9 +29,8 @@ sInfoObj_baseUrl = "https://pitt.preservica.com/api/entity/information-objects/" sContentObj_baseUrl = "https://pitt.preservica.com/api/entity/content-objects/" - InfoObjdata = defaultdict(list) -def getObjInfo(apiUrl): # -------------PASS--------------# +def getObjInfo(apiUrl): InfoObjdata = defaultdict(list) try: responses = requests.get(apiUrl, headers=headers) @@ -130,7 +130,7 @@ def getRepresentionInfo(sUrl, sRef_ID): contentObj_lst[next(iter(listOfRepresentContent))] = BitstreamCount else: 
contentObj_lst[next(iter(listOfRepresentContent))] += BitstreamCount - #print(contentObj_lst) #this should be the total bitstreams under the InformationObject + #this should be the total bitstreams under the InformationObject return (contentObj_lst) except requests.exceptions.RequestException as e: print("Error: ", e) @@ -146,8 +146,18 @@ def preservica_bitstream_valid (f_in): f_writer = csv.DictWriter(result_f, fieldnames=header_lst) f_writer.writeheader() #now iterate each objs from response - start_time =time.time() + global st_timer, curr_session + for row in csvreader: + #add logic to check token expiration before access preservica apis + #print(f"usage: {(time.time()-st_timer)*10**3:.02f} ms", curr_session) + if (round((time.time()-st_timer)*10**3) - 600000 >0 ): + new_session = token_fn.getRefreshToken(curr_session) + curr_session[0]= new_session[0] + curr_session[1] =new_session[1] + headers['Preservica-Access-Token']= new_session[0] + st_timer = time.time() + bitstream_dict ={} bitstream_dict = getRepresentionInfo(sInfoObj_baseUrl, row['preservica_refID']) if bitstream_dict: @@ -156,40 +166,15 @@ def preservica_bitstream_valid (f_in): f_writer.writerow({header_lst[0]:row['PID'], header_lst[1]:row['num_isPageOf_uri_s'], header_lst[2]:k, header_lst[3]:v}) else: + tmp = "Mismatch-" + str(v) f_writer.writerow({header_lst[0]:row['PID'], header_lst[1]:row['num_isPageOf_uri_s'], - header_lst[2]:k, header_lst[3]:""}) - - print(f"usage: {(time.time()-start_time)*10**3:.03f} ms") - print(curr_session) - #need regenerate a refresh token - if ((time.time()-start_time) - 600000 >0 ): - new_session = token_fn.getRefreshToken(curr_session) - curr_session[0]= new_session[0] - curr_session[1] =new_session[1] - headers['Preservica-Access-Token']= new_session[0] - print(headers) - - -def drushfetchPids(): - file_name = curr = os.getcwd() +"/output/drush_pid.csv" - user = os.environ['USER'] if os.getenv("USER") is not None else os.environ['USERNAME'] - squery = 
'RELS_EXT_preservicaRef_literal_s:* ' - squery += 'AND (RELS_EXT_hasModel_uri_ms:info:fedora/islandora:manuscriptCModel OR RELS_EXT_hasModel_uri_ms:info:fedora/islandora:newspaperIssueCModel OR RELS_EXT_hasModel_uri_ms:info:fedora/islandora:bookCModel) ' - squery += 'AND NOT RELS_EXT_preservicaChildCount_literal_s:*' - try: - s = subprocess.check_call (['drush', '--root=/var/www/html/drupal7/', '--user={}'.format(user), \ - '--uri=http://gamera.library.pitt.edu', 'islandora_datastream_crud_fetch_pids', \ - '--solr_query={}'.format(squery), '--pid_file={}'.format(file_name)]) + header_lst[2]:k, header_lst[3]:tmp}) - except subprocess.CalledProcessError as e: - print(f"Command failed with return code {e.returncode}") - if __name__ == "__main__": - #drushfetchPids() curr_session = token_fn.generateToken() print("token :" ,curr_session[0], " refresh-token: ", curr_session[1]) headers['Preservica-Access-Token'] = curr_session[0] - + st_timer = time.time() preservica_bitstream_valid(islandora_count_f) diff --git a/rdfUpdate.py b/rdfUpdate.py index 1334020..0691ae6 100644 --- a/rdfUpdate.py +++ b/rdfUpdate.py @@ -31,7 +31,7 @@ def getVerifiedPids(f_pids): with open(os.path.join(f_path, f_output), 'w', newline='') as temp_f: csvwriter = csv.DictWriter(temp_f, fieldnames=header_fields) for r in csvreader: - if (r["bitstreamCount"]): # verified the match + if (r["bitstreamCount"] and ("Mismatch" not in r["bitstreamCount"])): # verified the match csvwriter.writerow({header_fields[0]:r["PID"]}) #2.Iterate the intake pids and extract the ext-rel file via drush from islandora @@ -79,13 +79,13 @@ def updateExtRelFile(fpath, fname, e_name): with open (os.path.join( f_path, fname ), 'r', newline='') as pf: csvreader = csv.DictReader(pf) for r in csvreader: - if (r["bitstreamCount"]): # pid with the verified countmatch - #find the ext file matching the name r['PID'], might use re - tmp_pattern = r['PID'] + "^RELS-EXT.rdf" - for file in os.listdir(curr+fpath): - if 
fnmatch.fnmatch(file, tmp_pattern): - print(file) - fileProcess(file, e_name, r["bitstreamCount"]) + if (r["bitstreamCount"] and ("Mismatch" not in r["bitstreamCount"])): # pid with the verified countmatch + #find the ext file matching the name r['PID'], might use re + tmp_pattern = r['PID'] + "^RELS-EXT.rdf" + for file in os.listdir(curr+fpath): + if fnmatch.fnmatch(file, tmp_pattern): + print(file) + fileProcess(file, e_name, r["bitstreamCount"]) #5. push modified .rdf to islandora via drush def drushpushDatastreams(): From 2edc2095920e441501e3adf6dcccd2465c366d5b Mon Sep 17 00:00:00 2001 From: Ruiling Zhang Date: Thu, 11 Jul 2024 10:29:36 -0400 Subject: [PATCH 4/7] update README and add output sample --- README.md | 13 ++++++++----- output/valid_result.csv | 12 ++++++++++++ preservicaObjCapture.py | 2 ++ 3 files changed, 22 insertions(+), 5 deletions(-) create mode 100644 output/valid_result.csv diff --git a/README.md b/README.md index b3336d6..ccfaeac 100644 --- a/README.md +++ b/README.md @@ -18,15 +18,18 @@ This command takes a single argument of the path to the downloaded file. The script will add (or replace) the element of `islandora:preservicaRef` to the RELS-EXT with the value of the Preservica Ref identifier. -## islandoraObjectCheck.py +## The python files described below are to utilize the comparison between pageOf members of islandora objects and bitstreams of the corresponding objects from preservica + +#### Software specification: *Python 3.8+ recommended* +### islandoraObjectCheck.py The file is to intake a pidlist file and search through islandora objects via solr admin, then process the response and compute the total child pageitems of the object, as well as the preservia reference ID associated to the object -## preservicaCheck.py -The file is to generate preservica token to access the preservica restful apis +### preservicaCheck.py +The file is to generate preservica token to access the preservica restful apis. 
Apply preservica's authorized user/pw before execution. -## preservicaObjCapture.py +### preservicaObjCapture.py The file is to intake islandora's objects' pageOf member counts and compare the bitstreams count from the corresponding preservica objects. The script also execute a drush command to export the rdf for the countMatched objects. -## rdfUpdate.py +### rdfUpdate.py The script is to iterate all the updated rdf files and use drush push back to islandora. diff --git a/output/valid_result.csv b/output/valid_result.csv new file mode 100644 index 0000000..7edf371 --- /dev/null +++ b/output/valid_result.csv @@ -0,0 +1,12 @@ +PID,islandora_count,preservica_refID,bitstreamCount +pitt:31735073061008,29,e55870c4-2b5b-48a6-a2f9-c3d13f2a96b0,29 +pitt:31735073060950,3,0e40ac91-6c79-4e58-af29-8abe2a59659f,3 +pitt:31735073060893,9,d2102e2b-d769-4492-9939-31861c1a1e30,9 +pitt:31735073060927,6,4b2346c0-6e83-4e64-96ac-77a6ad220734,6 +pitt:2000.07.008,1,452e7fee-7034-4158-bd5e-92efb9aaa5f1,1 +pitt:2000.07.014,1,13b0fafd-76a5-4cfb-be27-2c8e986584a7,1 +pitt:2000.07.009,1,a34130e1-5acd-40bd-addc-e5fb47853737,1 +pitt:2000.07.010,1,79c25b21-1f0d-492c-937e-b07a756ddc1e,1 +pitt:2000.07.011,1,3ae034b2-d09a-44f2-9b77-cc326c9bdd76,1 +pitt:2000.07.063,1,883fdbfa-34e8-4687-90e7-b53910a1d453,1 +pitt:1935e49702,272,d2e0a615-898e-4896-a176-ad00b907fa82,272 diff --git a/preservicaObjCapture.py b/preservicaObjCapture.py index 62c8dfa..aa5b258 100644 --- a/preservicaObjCapture.py +++ b/preservicaObjCapture.py @@ -165,10 +165,12 @@ def preservica_bitstream_valid (f_in): if v == int(row['num_isPageOf_uri_s']): f_writer.writerow({header_lst[0]:row['PID'], header_lst[1]:row['num_isPageOf_uri_s'], header_lst[2]:k, header_lst[3]:v}) + print("ObjectID: ", row['PID'] , "membercounts is matched ") else: tmp = "Mismatch-" + str(v) f_writer.writerow({header_lst[0]:row['PID'], header_lst[1]:row['num_isPageOf_uri_s'], header_lst[2]:k, header_lst[3]:tmp}) + print("ObjectID: ", row['PID'] , " membercounts is 
not matched ") if __name__ == "__main__": curr_session = token_fn.generateToken() From e1bade557066e2de1bc26e392876b7f473f74ca0 Mon Sep 17 00:00:00 2001 From: Ruiling Zhang Date: Fri, 12 Jul 2024 18:41:46 -0400 Subject: [PATCH 5/7] add valid flg and modify login credentials process, refactor the code --- README.md | 1 - .../.gitignore | 0 islandora-preservica-validation/README.md | 17 ++++++++++ .../input}/file-pids-sample.csv | 8 ++--- .../islandoraObjCheck.py | 0 .../output/valid_result-sample.csv | 10 ++++++ .../preservicaCheck.py | 33 +++++++++++++++++++ .../preservicaObjCapture.py | 20 +++++------ .../rdfUpdate.py | 20 +++++++---- output/valid_result.csv | 12 ------- preservicaCheck.py | 27 --------------- 11 files changed, 84 insertions(+), 64 deletions(-) rename .gitignore => islandora-preservica-validation/.gitignore (100%) create mode 100644 islandora-preservica-validation/README.md rename {input => islandora-preservica-validation/input}/file-pids-sample.csv (54%) rename islandoraObjCheck.py => islandora-preservica-validation/islandoraObjCheck.py (100%) create mode 100644 islandora-preservica-validation/output/valid_result-sample.csv create mode 100644 islandora-preservica-validation/preservicaCheck.py rename preservicaObjCapture.py => islandora-preservica-validation/preservicaObjCapture.py (93%) rename rdfUpdate.py => islandora-preservica-validation/rdfUpdate.py (87%) delete mode 100644 output/valid_result.csv delete mode 100644 preservicaCheck.py diff --git a/README.md b/README.md index ccfaeac..91412a5 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,6 @@ The script will add (or replace) the element of `islandora:preservicaRef` to the ## The python files described below are to utilize the comparison between pageOf members of islandora objects and bitstreams of the corresponding objects from preservica -#### Software specification: *Python 3.8+ recommended* ### islandoraObjectCheck.py The file is to intake a pidlist file and search through islandora 
objects via solr admin, then process the response and compute the total child pageitems of the object, as well as the preservia reference ID associated to the object diff --git a/.gitignore b/islandora-preservica-validation/.gitignore similarity index 100% rename from .gitignore rename to islandora-preservica-validation/.gitignore diff --git a/islandora-preservica-validation/README.md b/islandora-preservica-validation/README.md new file mode 100644 index 0000000..3559be8 --- /dev/null +++ b/islandora-preservica-validation/README.md @@ -0,0 +1,17 @@ +## Description +The islandora-preservica-validation process compares each islandora object's page-member count with the bitstream count of the corresponding preservica object. An islandora object is validated if the counts match, and the object's rdf is then updated by adding a new element whose value is that count. + +### Requirements + * Python 3.12 + * pip install -r requirements.txt + +### Process +* execute islandoraObjCheck.py to retrieve all islandora objects needed. It generates an output file containing each object's ID, its page-member count and the corresponding preservica object reference ID +* execute preservicaObjCapture.py to validate the bitstream counts from preservica against islandora. It will prompt the user for preservica login credentials in order to generate a token for the preservica RESTful APIs +* execute rdfUpdate.py to update the rdfs for the validated islandora objects and use drush to push the updates back to islandora + +## Disclaimer + + THIS SCRIPT IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT + LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ diff --git a/input/file-pids-sample.csv b/islandora-preservica-validation/input/file-pids-sample.csv similarity index 54% rename from input/file-pids-sample.csv rename to islandora-preservica-validation/input/file-pids-sample.csv index 3a062f9..f1b26c8 100644 --- a/input/file-pids-sample.csv +++ b/islandora-preservica-validation/input/file-pids-sample.csv @@ -1,11 +1,9 @@ pitt:31735073061008 -pitt:31735073060950 -pitt:31735073060893 pitt:31735073060927 -pitt:2000.07.008 -pitt:2000.07.014 -pitt:2000.07.009 +pitt:31735073060901 +pitt:31735073060943 pitt:2000.07.010 pitt:2000.07.011 pitt:2000.07.063 pitt:1935e49702 +pitt:193xe49702 diff --git a/islandoraObjCheck.py b/islandora-preservica-validation/islandoraObjCheck.py similarity index 100% rename from islandoraObjCheck.py rename to islandora-preservica-validation/islandoraObjCheck.py diff --git a/islandora-preservica-validation/output/valid_result-sample.csv b/islandora-preservica-validation/output/valid_result-sample.csv new file mode 100644 index 0000000..56ee9d2 --- /dev/null +++ b/islandora-preservica-validation/output/valid_result-sample.csv @@ -0,0 +1,10 @@ +PID,islandora_count,preservica_refID,bitstreamCount,isValid +pitt:31735073061008,29,e55870c4-2b5b-48a6-a2f9-c3d13f2a96b0,29,Y +pitt:31735073060927,6,4b2346c0-6e83-4e64-96ac-77a6ad220734,6,Y +pitt:31735073060901,23,26f2f20b-4806-431b-85e3-a86cbb6fa425,23,Y +pitt:31735073060943,8,bc19d052-068a-494d-aefb-4b28119dfb7e,8,Y +pitt:2000.07.010,1,79c25b21-1f0d-492c-937e-b07a756ddc1e,1,Y +pitt:2000.07.011,1,3ae034b2-d09a-44f2-9b77-cc326c9bdd76,1,Y +pitt:2000.07.063,1,883fdbfa-34e8-4687-90e7-b53910a1d453,1,Y +pitt:1935e49702,272,d2e0a615-898e-4896-a176-ad00b907fa82,272,Y +pitt:193xe49702,288,8f0b8647-1209-49d3-a6b7-6a8646f225d7,287,N diff --git a/islandora-preservica-validation/preservicaCheck.py b/islandora-preservica-validation/preservicaCheck.py new file mode 100644 index 0000000..c686735 --- /dev/null +++ b/islandora-preservica-validation/preservicaCheck.py @@ 
-0,0 +1,33 @@ +import requests +import getopt, sys +from getpass import getpass + +#generate token to access preservica restful api +sUrl="https://pitt.preservica.com/api/accesstoken/login" +headers = {"Contnent-Type": "application/x-www-form-urlencoded"} + +def generateToken(): + #retrieve login credentials + testusr = getpass("Please enter login: ") + testpw =getpass("Please enter password: ") + if (testpw and testusr): + sdata ={ + "username": testusr, + "password": testpw + } + #retrieve token + r = requests.post (sUrl, data=sdata, headers=headers) + if r.status_code != 200: + print("Error:" , r.status_code) + sys.exit(-1) + else: + return [r.json()['token'], r.json()['refresh-token']] + +def getRefreshToken(s): + sRefreshUrl ="https://pitt.preservica.com/api/accesstoken/refresh?refreshToken=" + s[1] + newheaders = {"Preservica-Access-Token" : s[0], + "Contnent-Type": "application/x-www-form-urlencoded"} + + res = requests.post (sRefreshUrl, headers=newheaders) + if res.status_code == 200: + return [res.json()['token'], res.json()['refresh-token']] diff --git a/preservicaObjCapture.py b/islandora-preservica-validation/preservicaObjCapture.py similarity index 93% rename from preservicaObjCapture.py rename to islandora-preservica-validation/preservicaObjCapture.py index aa5b258..d847773 100644 --- a/preservicaObjCapture.py +++ b/islandora-preservica-validation/preservicaObjCapture.py @@ -23,7 +23,6 @@ headers = { 'Preservica-Access-Token': tmp } - curr_session=[] sInfoObj_baseUrl = "https://pitt.preservica.com/api/entity/information-objects/" @@ -130,7 +129,6 @@ def getRepresentionInfo(sUrl, sRef_ID): contentObj_lst[next(iter(listOfRepresentContent))] = BitstreamCount else: contentObj_lst[next(iter(listOfRepresentContent))] += BitstreamCount - #this should be the total bitstreams under the InformationObject return (contentObj_lst) except requests.exceptions.RequestException as e: print("Error: ", e) @@ -142,20 +140,19 @@ def preservica_bitstream_valid (f_in): 
csvreader = csv.DictReader(islandoraCount_f) with open(os.path.join(f_path, valid_result_f), 'w', newline='') as result_f: - header_lst = ['PID', 'islandora_count', 'preservica_refID', 'bitstreamCount'] + header_lst = ['PID', 'islandora_count', 'preservica_refID', 'bitstreamCount','isValid'] f_writer = csv.DictWriter(result_f, fieldnames=header_lst) f_writer.writeheader() - #now iterate each objs from response - global st_timer, curr_session + global st_timer, curr_session for row in csvreader: - #add logic to check token expiration before access preservica apis - #print(f"usage: {(time.time()-st_timer)*10**3:.02f} ms", curr_session) + #logic to check token expiration if (round((time.time()-st_timer)*10**3) - 600000 >0 ): new_session = token_fn.getRefreshToken(curr_session) curr_session[0]= new_session[0] curr_session[1] =new_session[1] headers['Preservica-Access-Token']= new_session[0] + print("Check new token: ", curr_session[0]) st_timer = time.time() bitstream_dict ={} @@ -164,13 +161,12 @@ def preservica_bitstream_valid (f_in): for k,v in bitstream_dict.items(): if v == int(row['num_isPageOf_uri_s']): f_writer.writerow({header_lst[0]:row['PID'], header_lst[1]:row['num_isPageOf_uri_s'], - header_lst[2]:k, header_lst[3]:v}) - print("ObjectID: ", row['PID'] , "membercounts is matched ") + header_lst[2]:k, header_lst[3]:v, header_lst[4]:"Y"}) + print( row['PID'] , " membercounts matched preservica bitstreams ", v) else: - tmp = "Mismatch-" + str(v) f_writer.writerow({header_lst[0]:row['PID'], header_lst[1]:row['num_isPageOf_uri_s'], - header_lst[2]:k, header_lst[3]:tmp}) - print("ObjectID: ", row['PID'] , " membercounts is not matched ") + header_lst[2]:k, header_lst[3]:v, header_lst[4]:"N"}) + print(row['PID'] , "membercounts " , row['num_isPageOf_uri_s'], " mismatched preservica bitstreams ", v ) if __name__ == "__main__": curr_session = token_fn.generateToken() diff --git a/rdfUpdate.py b/islandora-preservica-validation/rdfUpdate.py similarity index 87% rename 
from rdfUpdate.py rename to islandora-preservica-validation/rdfUpdate.py index 0691ae6..89b2927 100644 --- a/rdfUpdate.py +++ b/islandora-preservica-validation/rdfUpdate.py @@ -31,7 +31,7 @@ def getVerifiedPids(f_pids): with open(os.path.join(f_path, f_output), 'w', newline='') as temp_f: csvwriter = csv.DictWriter(temp_f, fieldnames=header_fields) for r in csvreader: - if (r["bitstreamCount"] and ("Mismatch" not in r["bitstreamCount"])): # verified the match + if (r["bitstreamCount"] and (r["isValid"]=="Y")): csvwriter.writerow({header_fields[0]:r["PID"]}) #2.Iterate the intake pids and extract the ext-rel file via drush from islandora @@ -79,8 +79,7 @@ def updateExtRelFile(fpath, fname, e_name): with open (os.path.join( f_path, fname ), 'r', newline='') as pf: csvreader = csv.DictReader(pf) for r in csvreader: - if (r["bitstreamCount"] and ("Mismatch" not in r["bitstreamCount"])): # pid with the verified countmatch - #find the ext file matching the name r['PID'], might use re + if (r["bitstreamCount"] and (r["isValid"]=="Y")): tmp_pattern = r['PID'] + "^RELS-EXT.rdf" for file in os.listdir(curr+fpath): if fnmatch.fnmatch(file, tmp_pattern): @@ -98,15 +97,22 @@ def drushpushDatastreams(): print(f"Command failed with return code {e.returncode}") if __name__ == "__main__": + #make clean directories before retrieving datastreams + if os.path.exists(curr + extRel_fpath) and os.path.isdir(curr + extRel_fpath): + shutil.rmtree(curr + extRel_fpath) + if os.path.exists(curr + update_fpath) and os.path.isdir(curr + update_fpath): + shutil.rmtree(curr + update_fpath) + + #extract the datastreams for matched objects getVerifiedPids(valid_result_f) drushfetchDatastream() - ##copy all the file in an update_fpath to use for testing purpose - org_files = os.listdir(curr+extRel_fpath) - shutil.copytree(curr+extRel_fpath, curr+update_fpath) + #keep original datastreams for testing purpose + org_files = os.listdir(curr + extRel_fpath) + shutil.copytree(curr+extRel_fpath, curr + 
update_fpath) newTagName = "preservicaChildCount" - updateExtRelFile(update_fpath, valid_result_f,newTagName) + updateExtRelFile(update_fpath, valid_result_f, newTagName) #drushpushDatastreams() diff --git a/output/valid_result.csv b/output/valid_result.csv deleted file mode 100644 index 7edf371..0000000 --- a/output/valid_result.csv +++ /dev/null @@ -1,12 +0,0 @@ -PID,islandora_count,preservica_refID,bitstreamCount -pitt:31735073061008,29,e55870c4-2b5b-48a6-a2f9-c3d13f2a96b0,29 -pitt:31735073060950,3,0e40ac91-6c79-4e58-af29-8abe2a59659f,3 -pitt:31735073060893,9,d2102e2b-d769-4492-9939-31861c1a1e30,9 -pitt:31735073060927,6,4b2346c0-6e83-4e64-96ac-77a6ad220734,6 -pitt:2000.07.008,1,452e7fee-7034-4158-bd5e-92efb9aaa5f1,1 -pitt:2000.07.014,1,13b0fafd-76a5-4cfb-be27-2c8e986584a7,1 -pitt:2000.07.009,1,a34130e1-5acd-40bd-addc-e5fb47853737,1 -pitt:2000.07.010,1,79c25b21-1f0d-492c-937e-b07a756ddc1e,1 -pitt:2000.07.011,1,3ae034b2-d09a-44f2-9b77-cc326c9bdd76,1 -pitt:2000.07.063,1,883fdbfa-34e8-4687-90e7-b53910a1d453,1 -pitt:1935e49702,272,d2e0a615-898e-4896-a176-ad00b907fa82,272 diff --git a/preservicaCheck.py b/preservicaCheck.py deleted file mode 100644 index 2852ac8..0000000 --- a/preservicaCheck.py +++ /dev/null @@ -1,27 +0,0 @@ -import requests - -#generate token to access preservica restful api -sUrl="https://pitt.preservica.com/api/accesstoken/login" -testuser ="testuser@pitt.edu" -testpw="testpassword" -data ={"username": testuser, - "password": testpw - } -headers = {"Contnent-Type": "application/x-www-form-urlencoded"} -def generateToken(): - r = requests.post (sUrl, data=data, headers=headers) - if r.status_code != 200: - print("Error:" , r.status_code) - exit(1) - else: - return [r.json()['token'], r.json()['refresh-token']] - - -def getRefreshToken(s): - sRefreshUrl ="https://pitt.preservica.com/api/accesstoken/refresh?refreshToken=" + s[1] - newheaders = {"Preservica-Access-Token" : s[0], - "Contnent-Type": "application/x-www-form-urlencoded"} - - res = 
requests.post (sRefreshUrl, headers=newheaders) - if res.status_code == 200: - return [res.json()['token'], res.json()['refresh-token']] From 84137b659df20e640dc866094b5ba5dde506eb06 Mon Sep 17 00:00:00 2001 From: Ruiling Zhang Date: Fri, 12 Jul 2024 18:45:13 -0400 Subject: [PATCH 6/7] moved requirement to validation dir --- .../requirements.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename requirements.txt => islandora-preservica-validation/requirements.txt (100%) diff --git a/requirements.txt b/islandora-preservica-validation/requirements.txt similarity index 100% rename from requirements.txt rename to islandora-preservica-validation/requirements.txt From 56d1559972c1fc4d41d12820cd655d598ecec19b Mon Sep 17 00:00:00 2001 From: Ruiling Zhang Date: Tue, 6 Aug 2024 14:39:33 -0400 Subject: [PATCH 7/7] Used the new preservica api 7.3 version;misc. updates based on code review --- .../preservicaCheck.py | 5 +- .../preservicaObjCapture.py | 52 ++++++++++++------- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/islandora-preservica-validation/preservicaCheck.py b/islandora-preservica-validation/preservicaCheck.py index c686735..5faef24 100644 --- a/islandora-preservica-validation/preservicaCheck.py +++ b/islandora-preservica-validation/preservicaCheck.py @@ -4,7 +4,7 @@ #generate token to access preservica restful api sUrl="https://pitt.preservica.com/api/accesstoken/login" -headers = {"Contnent-Type": "application/x-www-form-urlencoded"} +headers = {"Content-Type": "application/x-www-form-urlencoded"} def generateToken(): #retrieve login credentials @@ -31,3 +31,6 @@ def getRefreshToken(s): res = requests.post (sRefreshUrl, headers=newheaders) if res.status_code == 200: return [res.json()['token'], res.json()['refresh-token']] + else: + print("Error: Failed to get refresh token" , res.status_code) + sys.exit(-1) diff --git a/islandora-preservica-validation/preservicaObjCapture.py b/islandora-preservica-validation/preservicaObjCapture.py index 
d847773..f3dcd0e 100644 --- a/islandora-preservica-validation/preservicaObjCapture.py +++ b/islandora-preservica-validation/preservicaObjCapture.py @@ -3,8 +3,8 @@ ## process an inputFile containing Islandora Objects'pid,numberOfPage of the Object to valid ## the associated preservica objects by checking the count of the bitstreams for each preservica ## object matching the numberOfPage count from the correspondent islandora object -## @params: islandorapids.csv : file generated from islandoraObjCheck process -## @result: valid_result.csv +## @params: islandora_count_f : file generated from islandoraObjCheck process +## @result: valid_result_f: csv output ########################################################### import requests, json, csv from xml.etree import ElementTree as etree @@ -29,6 +29,11 @@ sContentObj_baseUrl = "https://pitt.preservica.com/api/entity/content-objects/" InfoObjdata = defaultdict(list) +#function to retrieve details of the preservica information object +#@param: sInfoObj_baseUrl bind with the preservica refId +#@return: dict {'PIDInfo': preservica IO identifer, 'representationInfo': IO Representation url} if success, +# otherwise output request error +# def getObjInfo(apiUrl): InfoObjdata = defaultdict(list) try: @@ -38,19 +43,20 @@ def getObjInfo(apiUrl): #process the xml entity_response = etree.fromstring(xml_response) - reference = entity_response.find('.//{http://preservica.com/XIP/v7.2}Ref') - identifier = entity_response.find('.//{http://preservica.com/EntityAPI/v7.2}Identifiers') - representation = entity_response.find('.//{http://preservica.com/EntityAPI/v7.2}Representations') + reference = entity_response.find('.//{http://preservica.com/XIP/v7.3}Ref') + identifier = entity_response.find('.//{http://preservica.com/EntityAPI/v7.3}Identifiers') + representation = entity_response.find('.//{http://preservica.com/EntityAPI/v7.3}Representations') tmpObjInfo = {} tmpObjInfo["PIDInfo"] = identifier.text tmpObjInfo["representationInfo"] = 
representation.text InfoObjdata[reference.text] = tmpObjInfo - return InfoObjdata except requests.exceptions.RequestException as e: print("Error: ", e) -#capture all ContentObjects from Representations : dict {objectId: contentids} +#capture all ContentObjects from Representations +#@param: preservica CO base apiUrl +#@return: dict {objectId: contentids} def getContentObjID(sRep_Url): r = requests.get(sRep_Url, headers=headers) counter =0 @@ -60,8 +66,8 @@ def getContentObjID(sRep_Url): xml_resRepresentation = str(r.content.decode('UTF-8')) #process the xml res_content_response = etree.fromstring(xml_resRepresentation) - infoObjID = res_content_response.find('.//{http://preservica.com/XIP/v7.2}InformationObject') - contentobjs = res_content_response.findall('.//{http://preservica.com/XIP/v7.2}ContentObjects/{http://preservica.com/XIP/v7.2}ContentObject') + infoObjID = res_content_response.find('.//{http://preservica.com/XIP/v7.3}InformationObject') + contentobjs = res_content_response.findall('.//{http://preservica.com/XIP/v7.3}ContentObjects/{http://preservica.com/XIP/v7.3}ContentObject') tempcontentid =[] for contentobj in contentobjs: @@ -71,6 +77,7 @@ def getContentObjID(sRep_Url): return contentobjdata #make a generic call to retrieve object data from preservica restapi +#helper function def getcontenobjInfo(sObjbaseUrl, sobjitem="", sParam=""): try: r = (sobjitem and sParam) and requests.get(f'{sObjbaseUrl}{sParam}/{sobjitem}', headers=headers) or requests.get(f'{sObjbaseUrl}', headers=headers) @@ -81,22 +88,28 @@ def getcontenobjInfo(sObjbaseUrl, sobjitem="", sParam=""): except requests.exceptions.RequestException as e: print("Error: ", e) -#capture the generations of contentobject id +#capture generations of the passingin contentobject +#@param: sContentGen: generation type +# sRefId: preservica refID +#@return: contBitstream: dict with key as the preservica RefID, value of bitstream numCount def getbitstreamInfo(sContentGen, sRefId): res_gen = 
getcontenobjInfo(sContentObj_baseUrl, sContentGen , sRefId) - genLst = res_gen.findall('.//{http://preservica.com/EntityAPI/v7.2}Generations/{http://preservica.com/EntityAPI/v7.2}Generation[@active ="true"]') + genLst = res_gen.findall('.//{http://preservica.com/EntityAPI/v7.3}Generations/{http://preservica.com/EntityAPI/v7.3}Generation[@active ="true"]') contBitstream = defaultdict(list) total = 0 #iterate generation to get the bitstream count for ele in genLst: - bitstreamLst = getcontenobjInfo(ele.text).findall('.//{http://preservica.com/XIP/v7.2}Bitstreams/{http://preservica.com/XIP/v7.2}Bitstream') + bitstreamLst = getcontenobjInfo(ele.text).findall('.//{http://preservica.com/XIP/v7.3}Bitstreams/{http://preservica.com/XIP/v7.3}Bitstream') total += len(bitstreamLst) contBitstream[sRefId] = total return contBitstream -#retrieves all representations of InformationObj and compute total bitstreams underneath contentObjs for the representation -def getRepresentionInfo(sUrl, sRef_ID): +#function to retrieve all representations of InformationObj and compute total bitstreams underneath contentObjs for the representation +#@param: sUrl: preservica IO base apiUrl +# sRef_ID: preservica InformObject RefID +#@return: contentObj_lst: dict of preservica RefID and its bitstreams count +def getRepresentationInfo(sUrl, sRef_ID): stempUrl = sUrl + sRef_ID testInfoObj = getObjInfo(stempUrl) if len(testInfoObj) > 0: @@ -107,14 +120,14 @@ def getRepresentionInfo(sUrl, sRef_ID): xml_reqRep = str(req_Rep.content.decode('UTF-8')) representation_rep = etree.ElementTree(etree.fromstring(xml_reqRep)) representations = representation_rep.findall( - './/{http://preservica.com/EntityAPI/v7.2}Representations/{http://preservica.com/EntityAPI/v7.2}Representation' + './/{http://preservica.com/EntityAPI/v7.3}Representations/{http://preservica.com/EntityAPI/v7.3}Representation' ) contentObj_lst ={} BitstreamCount =0 for representation in representations: #print(representation.text, " ",
representation.attrib) - #get all content object IDs for each of the represention + #get all content object IDs for each of the representation listOfRepresentContent = getContentObjID(representation.text) if len(listOfRepresentContent) > 0 : #get value of the first key since only one element @@ -132,6 +145,7 @@ def getRepresentionInfo(sUrl, sRef_ID): return (contentObj_lst) except requests.exceptions.RequestException as e: print("Error: ", e) + #match the count, if match update the xml, otherwise output log error def preservica_bitstream_valid (f_in): @@ -146,7 +160,7 @@ def preservica_bitstream_valid (f_in): global st_timer, curr_session for row in csvreader: - #logic to check token expiration + #logic to check token expiration using 10 mins as token refresh cycle if (round((time.time()-st_timer)*10**3) - 600000 >0 ): new_session = token_fn.getRefreshToken(curr_session) curr_session[0]= new_session[0] @@ -156,7 +170,7 @@ def preservica_bitstream_valid (f_in): st_timer = time.time() bitstream_dict ={} - bitstream_dict = getRepresentionInfo(sInfoObj_baseUrl, row['preservica_refID']) + bitstream_dict = getRepresentationInfo(sInfoObj_baseUrl, row['preservica_refID']) if bitstream_dict: for k,v in bitstream_dict.items(): if v == int(row['num_isPageOf_uri_s']): @@ -167,7 +181,7 @@ def preservica_bitstream_valid (f_in): f_writer.writerow({header_lst[0]:row['PID'], header_lst[1]:row['num_isPageOf_uri_s'], header_lst[2]:k, header_lst[3]:v, header_lst[4]:"N"}) print(row['PID'] , "membercounts " , row['num_isPageOf_uri_s'], " mismatched preservica bitstreams ", v ) - + if __name__ == "__main__": curr_session = token_fn.generateToken() print("token :" ,curr_session[0], " refresh-token: ", curr_session[1])