ulsdevteam · rzhang152 · Jun 25, 2024 · Jul 9, 2024 · Jul 10, 2024 · Jul 11, 2024
diff --git a/README.md b/README.md
@@ -17,3 +17,18 @@ A script to examine a download of a Process (Ingest Card) from the Preserica Mon
 This command takes a single argument of the path to the downloaded file.
 
 The script will add (or replace) the element of `islandora:preservicaRef` to the RELS-EXT with the value of the Preservica Ref identifier.
+
+## The Python files described below compare the pageOf member counts of Islandora objects with the bitstream counts of the corresponding Preservica objects
+
+### islandoraObjCheck.py
+This script takes a pidlist file, looks up each Islandora object through the Solr index, and processes the response to compute the object's total number of child page items along with the Preservica reference ID associated with the object.
+
+### preservicaCheck.py
+This script generates the Preservica token needed to access the Preservica RESTful APIs. Have an authorized Preservica username and password ready before running it.
+
+### preservicaObjCapture.py
+This script takes the pageOf member counts of the Islandora objects and compares them with the bitstream counts of the corresponding Preservica objects. It also executes a drush command to export the RDF for the objects whose counts match.
+
+### rdfUpdate.py
+This script iterates over all the updated RDF files and uses drush to push them back to Islandora.
+
diff --git a/islandora-preservica-validation/.gitignore b/islandora-preservica-validation/.gitignore
@@ -0,0 +1,8 @@
+*.log
+*.txt
+*.xml
+.env
+
+#sample testing file
+input/
+output/
diff --git a/islandora-preservica-validation/README.md b/islandora-preservica-validation/README.md
@@ -0,0 +1,17 @@
+## Description
+The islandora-preservica-validation process compares each Islandora object's page member count with the bitstream count of the corresponding Preservica object. An Islandora object is considered validated when the counts match, and its RDF is then updated by adding a new element whose value is that count.
+
+### Requirements
+ * Python 3.12
+ * The packages listed in requirements.txt (install with `pip install -r requirements.txt`)
+
+### Process
+* execute islandoraObjCheck.py to retrieve all the Islandora objects needed. It generates an output file containing each object's ID, its page member count and the corresponding Preservica object reference ID
+* execute preservicaObjCapture.py to validate the bitstream counts from Preservica against Islandora. It will prompt the user for Preservica login credentials in order to generate the token for the Preservica RESTful APIs
+* execute rdfUpdate.py to update the RDFs for the validated Islandora objects and use drush to push the updates back to Islandora
+
+## Disclaimer
+
+  THIS SCRIPT IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
+  LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
diff --git a/islandora-preservica-validation/input/file-pids-sample.csv b/islandora-preservica-validation/input/file-pids-sample.csv
@@ -0,0 +1,9 @@
+pitt:31735073061008
+pitt:31735073060927
+pitt:31735073060901
+pitt:31735073060943
+pitt:2000.07.010
+pitt:2000.07.011
+pitt:2000.07.063
+pitt:1935e49702
+pitt:193xe49702
diff --git a/islandora-preservica-validation/islandoraObjCheck.py b/islandora-preservica-validation/islandoraObjCheck.py
@@ -0,0 +1,116 @@
+##Function:  pageCount_of_Pid
+## Reads in a pidlist file containing pitt identifiers and, for each pid,
+## sends a solr request to count the pid's child objects, filtering on
+## 'RELS_EXT_isPageOf_uri_s' in the objects' metadata
+## @params: file_pids
+## @result: file_pgCount
+
+import requests, json, os
+import csv
+import subprocess 
+from collections import defaultdict
+
+#directory containing this script; the relative paths below are joined onto it
+f_path = os.path.dirname(os.path.realpath(__file__)) 
+file_pids = "./input/file-pids.csv"   #intakes pidfile
+#output csv holding PID, pageOf count and preservica reference ID per object
+file_pgCount="./output/membercount.csv" 
+
+#retrieve Object and its pageOf members from islandora 
+def get_islandoraData(s_query, timeout=60):
+    """Query solr for an object and the documents that are pages of it.
+
+    @params: s_query  PID of the parent object; everything after its first
+                      four characters is appended to an escaped "pitt" prefix
+             timeout  seconds to wait on the solr server before giving up
+    @result: the 'response' section of the solr json reply (a dict holding
+             'numFound' and 'docs'), or None when the request fails
+    """
+    #pid format convention: the backslash escapes the colon that follows the
+    #namespace so solr does not read it as a field separator
+    pid_tail = s_query[4:]
+    q_par = "PID:pitt\\" + pid_tail
+    q_pages = "RELS_EXT_isPageOf_uri_s: info\\:fedora\\/pitt\\" + pid_tail + " OR " + q_par
+
+    url = 'https://gamera.library.pitt.edu/solr/uls_digital_core/select'
+    payload = {"q": q_pages,
+               "fl": "PID,RELS_EXT_isPageOf_uri_s,RELS_EXT_hasModel_uri_ms,RELS_EXT_preservicaRef_literal_s",
+               "sort": "PID asc",
+               "rows": "100000",
+               "wt": "json"}
+    try:
+        #step1). retrieve object from islandora api request
+        responses = requests.get(url, params=payload, timeout=timeout)
+        #requests only raises HTTPError when asked to, so ask explicitly
+        responses.raise_for_status()
+        return responses.json()['response']
+    except (requests.exceptions.RequestException, ValueError, KeyError) as e:
+        #covers connection failures, timeouts, error statuses and replies that
+        #are not the expected solr json
+        print("Error: " + str(e))
+        return None
+
+#module-level accumulator keyed by parent object PID; each value is a dict
+#holding 'counter' (the pageOf count) and, when one was found,
+#'preservica_RefID'. It is shared by every get_multipart_count call.
+ms_items = defaultdict(list)
+
+# Helper function to compute the multipart objects via the relation mapping
+# 'RELS_EXT_isPageOf_uri_s' to the Object PID from solr api response
+def get_multipart_count(objID):
+    """Add the pageOf count for objID to the shared ms_items accumulator.
+
+    @params: objID  PID of the parent object to look up in solr
+    @result: ms_items, the module-level dict keyed by parent PID. It is
+             returned unchanged when the solr request yields no usable data.
+
+    Counts accumulate across calls, so looking up the same PID twice counts
+    its pages twice.
+    """
+    results = get_islandoraData(objID)
+    #a failed request does not yield a dict; skip this pid instead of crashing
+    #the whole run
+    if not isinstance(results, dict):
+        print("Error: no solr data returned for " + str(objID))
+        return ms_items
+
+    s_preservicaRef = ""
+    for data in results.get('docs', []):
+        #capture the preservica reference ID associated to the ObjectID, if existing
+        if "RELS_EXT_preservicaRef_literal_s" in data:
+            s_preservicaRef = data["RELS_EXT_preservicaRef_literal_s"]
+
+        #a document carrying isPageOf is a child page: credit it to its parent
+        if "RELS_EXT_isPageOf_uri_s" in data:
+            #the parent PID is the last path segment of the uri
+            uri_obj = data["RELS_EXT_isPageOf_uri_s"].split("/")[-1]
+            if uri_obj not in ms_items:
+                #kept as a defaultdict so a missing 'preservica_RefID' reads
+                #as an empty list instead of raising KeyError
+                tmpPagelst = defaultdict(list)
+                tmpPagelst['counter'] = 1
+                ms_items[uri_obj] = tmpPagelst
+            else:
+                ms_items[uri_obj]['counter'] += 1
+
+    #export the associated preservica reference ID if existing; only objects
+    #that already have a page entry are annotated
+    if s_preservicaRef and objID in ms_items:
+        ms_items[objID]['preservica_RefID'] = s_preservicaRef
+
+    return ms_items
+
+# Main Function: takes in PIDfile in the format {PID}. It iterates pids to check on islandora via 
+# solr search, and outputs a csv file containing total# of the Object's pageOf items from islandora, and 
+# preservica referenceID associated to the pid, if existing
+def pageCount_of_Pid(inFile_pids):
+    """Write the pageOf count and preservica reference ID for every listed PID.
+
+    @params: inFile_pids  path of the PID csv, relative to this script's
+                          directory; the PID is read from the first column
+    @result: None. Writes the csv named by file_pgCount (also relative to
+             the script directory) with one header row and one row per
+             parent object.
+    """
+    with open(os.path.join(f_path, inFile_pids), 'r', newline='') as pid_f:
+        pidreader = csv.reader(pid_f)
+
+        #write output file
+        with open(os.path.join(f_path, file_pgCount), 'w', newline='') as match_f:
+            header_lst = ['PID', 'num_isPageOf_uri_s', 'preservica_refID']
+            f_writer = csv.writer(match_f, delimiter=',')
+            f_writer.writerow(header_lst)
+
+            #stays None when the pid file has no usable rows, so the
+            #header-only output is written instead of raising
+            mydict = None
+            for row in pidreader:
+                #blank lines come back as empty rows; skip them
+                if not row or not row[0].strip():
+                    continue
+                mydict = get_multipart_count(row[0].strip())
+
+            #get_multipart_count returns the same accumulating dict on every
+            #call, so the rows are written once after all pids are processed
+            if mydict:
+                for k, v in mydict.items():
+                    f_writer.writerow([k, v['counter'], v['preservica_RefID']])
+
+def drushfetchPids():
+    """Refresh the PID input file with drush's islandora_datastream_crud_fetch_pids.
+
+    The solr query selects objects that have a preservica reference, use the
+    manuscript, newspaper issue or book content model, and do not yet have a
+    preservica child count. A non-zero drush exit status is reported on
+    stdout; nothing is returned.
+    """
+    #write to the same file pageCount_of_Pid reads (resolved against the
+    #script directory) so both steps agree even when run from elsewhere
+    file_name = os.path.normpath(os.path.join(f_path, file_pids))
+
+    user = os.getenv("USER")
+    if user is None:
+        #fall back to the variable Windows sets
+        user = os.environ['USERNAME']
+
+    #raw strings: '\:' is a solr escape, not a python one, and python 3.12
+    #warns about it in a normal string literal. The query text is unchanged.
+    squery = (
+        r'RELS_EXT_preservicaRef_literal_s:* '
+        r'AND (RELS_EXT_hasModel_uri_ms:info\:fedora/islandora\:manuscriptCModel OR RELS_EXT_hasModel_uri_ms:info\:fedora/islandora\:newspaperIssueCModel OR RELS_EXT_hasModel_uri_ms:info\:fedora/islandora\:bookCModel)'
+        r'AND NOT RELS_EXT_preservicaChildCount_literal_s:*'
+    )
+
+    try:
+        subprocess.check_call([
+            'drush',
+            '--root=/var/www/html/drupal7/',
+            '--user={}'.format(user),
+            '--uri=http://gamera.library.pitt.edu',
+            'islandora_datastream_crud_fetch_pids',
+            '--solr_query={}'.format(squery),
+            '--pid_file={}'.format(file_name),
+        ])
+    except subprocess.CalledProcessError as e:
+        print(f"Command failed with return code {e.returncode}")
+
+#script entry point: refresh the pid file through drush, then count the
+#pageOf members for every pid listed in it
+if __name__ == "__main__":
+    drushfetchPids()
+    pageCount_of_Pid(file_pids)
diff --git a/islandora-preservica-validation/output/valid_result-sample.csv b/islandora-preservica-validation/output/valid_result-sample.csv
@@ -0,0 +1,10 @@
+PID,islandora_count,preservica_refID,bitstreamCount,isValid
+pitt:31735073061008,29,e55870c4-2b5b-48a6-a2f9-c3d13f2a96b0,29,Y
+pitt:31735073060927,6,4b2346c0-6e83-4e64-96ac-77a6ad220734,6,Y
+pitt:31735073060901,23,26f2f20b-4806-431b-85e3-a86cbb6fa425,23,Y
+pitt:31735073060943,8,bc19d052-068a-494d-aefb-4b28119dfb7e,8,Y
+pitt:2000.07.010,1,79c25b21-1f0d-492c-937e-b07a756ddc1e,1,Y
+pitt:2000.07.011,1,3ae034b2-d09a-44f2-9b77-cc326c9bdd76,1,Y
+pitt:2000.07.063,1,883fdbfa-34e8-4687-90e7-b53910a1d453,1,Y
+pitt:1935e49702,272,d2e0a615-898e-4896-a176-ad00b907fa82,272,Y
+pitt:193xe49702,288,8f0b8647-1209-49d3-a6b7-6a8646f225d7,287,N
diff --git a/islandora-preservica-validation/preservicaCheck.py b/islandora-preservica-validation/preservicaCheck.py
@@ -0,0 +1,36 @@
+import requests
+import getopt, sys
+from getpass import getpass
+
+#login endpoint and form-encoded header that generateToken uses to obtain a
+#token for the preservica restful api
+sUrl="https://pitt.preservica.com/api/accesstoken/login"
+headers = {"Content-Type": "application/x-www-form-urlencoded"}
+
+def generateToken(timeout=30):
+    """Prompt for preservica credentials and request an access token.
+
+    @params: timeout  seconds to wait on the login endpoint before giving up
+    @result: [token, refresh-token] on success; None when either prompt is
+             left empty. Exits the process with status -1 when the server
+             does not answer 200.
+    """
+    #retrieve login credentials; both prompts hide what is typed
+    testusr = getpass("Please enter login: ")
+    testpw = getpass("Please enter password: ")
+    if not (testpw and testusr):
+        #say why nothing is returned instead of failing silently
+        print("Error: login and password are both required")
+        return None
+
+    sdata = {
+        "username": testusr,
+        "password": testpw,
+    }
+    #retrieve token
+    r = requests.post(sUrl, data=sdata, headers=headers, timeout=timeout)
+    if r.status_code != 200:
+        print("Error:" , r.status_code)
+        sys.exit(-1)
+
+    #parse the body once and read both tokens from it
+    body = r.json()
+    return [body['token'], body['refresh-token']]
+
+def getRefreshToken(s, timeout=30):
+    """Exchange the current token pair for a fresh one.
+
+    @params: s        [token, refresh-token] as returned by generateToken or
+                      a previous call to this function
+             timeout  seconds to wait on the refresh endpoint before giving up
+    @result: [token, refresh-token]. Exits the process with status -1 when
+             the server does not answer 200.
+    """
+    sRefreshUrl = "https://pitt.preservica.com/api/accesstoken/refresh"
+    newheaders = {"Preservica-Access-Token": s[0],
+                  "Content-Type": "application/x-www-form-urlencoded"}
+
+    #pass the refresh token through params so requests url-encodes it
+    res = requests.post(sRefreshUrl, params={"refreshToken": s[1]},
+                        headers=newheaders, timeout=timeout)
+    if res.status_code != 200:
+        print("Error: Failed to get refresh token" , res.status_code)
+        sys.exit(-1)
+
+    body = res.json()
+    return [body['token'], body['refresh-token']]