forked from jpstroop/URIs-to-EAD
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathowi.py
102 lines (94 loc) · 3.24 KB
/
owi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
#-*- coding: utf-8 -*-
"""
A simple, one-off, experimental script to get OCLC Work IDs (OWIs) into a batch of bib records. Here, they went into 787$o.
Uses pymarc, libxml2 and xmllint.
NOTE: There's a quota of 1,000 queries per day by default (this isn't immediately obvious). Check the following:
http://oclc.org/developer/develop/linked-data/worldcat-entities/worldcat-work-entity.en.html
http://www.oclc.org/developer/develop/web-services/xid-api.en.html
"""
from time import sleep
import libxml2
import os
import pickle
import pymarc
import requests
import shelve
import subprocess
import sys
XID_RESOLVER = "http://xisbn.worldcat.org/webservices/xid/oclcnum/%s"
WORK_ID = "http://worldcat.org/entity/work/id/"
SHELF_FILE = "./owi.db"
infile = "./input.marc.xml"
outfile = "./output_w_owis.marc.xml"
def check_shelf(ocn):
shelf = shelve.open(SHELF_FILE, protocol=pickle.HIGHEST_PROTOCOL)
if ocn in shelf:
workid = shelf[ocn]
os.sys.stdout.write("[Cache] Found: " + ocn + "\n")
else:
workid = query_oclc(ocn)
if workid != None and workid != '':
shelf[ocn] = workid
os.sys.stdout.write('put %s %s into db\n' % (ocn,workid))
shelf.close()
return workid
def query_oclc(xid):
'''
See the following for parameters:
http://www.oclc.org/developer/develop/web-services/xid-api/xstandardNumber-resource.en.html
'''
to_get = XID_RESOLVER % xid
to_get += "?method=getMetadata&format=xml&fl=*" # could also try &fl=owi
print(to_get) # uncomment to get the full request URI
headers = {"Accept":"application/xml"}
resp = requests.get(to_get, headers=headers, allow_redirects=True)
if resp.status_code == 200:
doc = libxml2.parseDoc(resp.text.encode("UTF-8", errors="ignore"))
ctxt = doc.xpathNewContext()
if ctxt.xpathEval("//@stat[.='overlimit']"):
print("over limit with %s" % xid)
sys.exit()
else:
try:
owi = ctxt.xpathEval("//@owi")[0].content
cleanowi = owi.replace('owi','')
return WORK_ID + cleanowi
except:
print("no owi found")
elif resp.status_code == 404:
msg = "Not found: %s%s" % (xid, os.linesep)
elif resp.status_code == 500:
msg = "Server error (%s)" % xid
else: # resp.status_code isn't 200, 404 or 500:
msg = " Response for %s was " % xid
msg += "%s%s" % (resp.status_code, os.linesep)
print(msg)
sleep(1)
if __name__ == "__main__":
mrxheader = """<?xml version="1.0" encoding="UTF-8" ?>
<collection xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">"""
fh = open('out/owi_tmp.xml', 'w+')
fh.write(mrxheader)
reader = pymarc.marcxml.parse_xml_to_array(infile)
for rec in reader:
for n in rec.get_fields('035'):
for s in n.get_subfields('a'):
if 'OCoLC' in s:
num = s.replace('(OCoLC)','')
workid = check_shelf(str(num))
if workid != None and workid != '':
field = pymarc.Field(
tag = '787',
indicators = ['0',' '],
subfields = [
'o', str(workid)
])
rec.add_field(field)
workid = ""
out = "%s" % (pymarc.record_to_xml(rec))
fh.write(out)
fh.write("</collection>")
fh.close()
# format output for readability
subprocess.Popen(['xmllint','--format','-o', outfile,'owi_tmp.xml'])