-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautoCollectionsFunctions.py
181 lines (146 loc) · 8.5 KB
/
autoCollectionsFunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import pandas as pd
import requests
import xml.etree.ElementTree as ElementTree
import re
import config
def getText(elem):
    '''
    Get the text from an XML element and return it as a string.

    elem is the element to read (or None, e.g. when a lookup found nothing).
    Returns the element's text with every '<br>' removed. Returns an empty
    string when elem is None, has no text attribute, or its text is not a
    string (for example None for an element without text).
    '''
    text = getattr(elem, 'text', None)
    # A missing element or a text-less element both collapse to an empty
    # string, which callers treat as "no statement".
    if not isinstance(text, str):
        return ""
    return text.replace('<br>', '')
def searchOpenURL(row):
    '''(pandas.Series) --> pandas.Series
    Query the library OpenURL link resolver for one journal and summarise its full-text services.

    row is one row of a pandas DataFrame; its 'q_issn' entry is the ISSN sent to the resolver. The function is meant to be used with DataFrame.apply.

    Returns a two-element pandas Series:
      0 - the availability statement, either 'Full-text available' or 'No full-text available'
      1 - a dict mapping each full-text service's package name to its coverage statement (empty when there is no full-text service)

    Services that report the same package name overwrite one another in the dict, so only the last one is kept.

    Raises whatever requests raises on connection failure or timeout, and xml.etree.ElementTree.ParseError when the response is not well-formed XML.
    '''
    issn = row['q_issn']

    # NOTE(review): config.base_URL presumably ends with '?' (or '&'), e.g.
    # 'https://<host>/view/uresolver/<institution>/openurl?', because the query
    # parameters are appended to it directly - confirm against config.
    # The timeout keeps one stalled request from hanging the whole apply() run.
    r = requests.get('{}svc_dat=CTO&issn={}'.format(config.base_URL, issn), timeout=60)

    # Parse the XML response and store it as root
    root = ElementTree.fromstring(r.content)

    # Namespace map so the XPath queries below can use the short 'resolver:' prefix
    ns = {'resolver': 'http://com/exlibris/urm/uresolver/xmlbeans/u'}

    # Look up the full-text services once and reuse the result
    services = root.findall('.//resolver:context_service[@service_type="getFullTxt"]', ns)

    # Key is the package name, value is the coverage dates statement
    coverage_statement = {}

    if services:
        avail_statement = 'Full-text available'
        print('Full-text available for ' + issn)
        for service in services:
            package = service.find('.//resolver:key[@id="package_public_name"]', ns)
            avail = service.find('.//resolver:key[@id="Availability"]', ns)
            # getText returns '' when a key is missing, so a service without
            # an Availability key is stored with an empty coverage statement
            coverage_statement[getText(package)] = getText(avail)
    else:
        avail_statement = 'No full-text available'
        print('Full-text not available for ' + issn)

    # Return the availability and coverage statements as a pandas Series
    return pd.Series([avail_statement, coverage_statement])
def coverageStatement_availParser(row):
    '''
    (pd.Series) -> pd.Series
    Classify a journal's full-text availability from its coverage statements.

    row['coverage'] is a dict whose values are coverage statement strings. Empty-string statements are ignored. The result is a one-element pandas Series holding, in order of priority:
      'No full-text available'                - the coverage dict is empty
      'Full-text available to present'        - any statement contains neither 'Most recent' nor 'until'
      'Full-text available with embargo'      - otherwise, any statement contains 'Most recent'
      'Full-text available, but not complete' - otherwise, at least one non-empty statement remains (each of them contains 'until')
      ''                                      - the dict is not empty but every statement is an empty string

    NOTE(review): the 'No full-text available' case is paired with the empty-dict check, as the description above says - confirm this matches the intended nesting.
    '''
    coverage = row['coverage']

    # Only classify when there are full-text resources
    if coverage == {}:
        return pd.Series(['No full-text available'])

    # Skip values that don't contain any data
    statements = [value for value in coverage.values() if value != '']

    if any('Most recent' not in s and 'until' not in s for s in statements):
        # One open-ended statement is enough: coverage runs up to the present
        avail_statement = 'Full-text available to present'
    elif any('Most recent' in s for s in statements):
        avail_statement = 'Full-text available with embargo'
    elif statements:
        # Nothing open-ended and no embargo, so every remaining statement
        # contains 'until': available, but not up to the present
        avail_statement = 'Full-text available, but not complete'
    else:
        avail_statement = ''

    return pd.Series([avail_statement])
def coverageStatement_yearsParser(row):
    '''
    (pd.Series) -> pd.Series
    Turn the link resolver's coverage statements into short date ranges.

    row['coverage'] is a dict whose values are coverage statement strings. Only statements containing 'Available from' are used. Each one contributes:
      '<start> - present'       when it contains neither 'Most recent' nor 'until'
      '<start> - <period> ago'  when it contains 'Most recent' (period is the text between 'Most recent ' and '(s)')
      '<start> - <end>'         when it contains 'until'
    A statement with both 'Most recent' and 'until' contributes two entries, embargo first.

    When the expected four-digit year or embargo period cannot be found, the entry is '?' (the same placeholder coverageStatement_yearsParser2 uses), so one odd statement does not abort a whole DataFrame.apply run.

    Returns a one-element pandas Series whose single value is the list of entries.
    '''
    tempList = []
    # An empty coverage dict simply yields no iterations
    for value in row['coverage'].values():
        # Only use statements that have data on the date coverage
        if 'Available from' not in value:
            continue

        # The start year is shared by all three cases; None when the
        # keyword is not followed by a four-digit year
        start = re.search(r"Available from (\d{4})", value)
        embargoed = 'Most recent' in value
        ended = 'until' in value

        # For coverage statements to the present
        if not embargoed and not ended:
            tempList.append(start.group(1) + ' - present' if start else '?')

        # For coverage statements with an embargo
        if embargoed:
            embargo = re.search(r'Most recent (.*?)\(s\)', value)
            if start and embargo:
                tempList.append(start.group(1) + ' - ' + embargo.group(1) + ' ago')
            else:
                tempList.append('?')

        # For coverage statements that aren't current to the present
        if ended:
            end = re.search(r"until (\d{4})", value)
            if start and end:
                tempList.append(start.group(1) + ' - ' + end.group(1))
            else:
                tempList.append('?')

    return pd.Series([tempList])
def coverageStatement_yearsParser2(row):
    '''
    (pd.Series) -> pd.Series
    Turn the link resolver's coverage statements into short date ranges, using '?' for anything that cannot be parsed.

    row['coverage'] is a dict whose values are coverage statement strings. Only statements containing 'Available from' are used. Each one contributes:
      '<start> - present'       when it contains neither 'Most recent' nor 'until'
      '<start> - <period> ago'  when it contains 'Most recent' (period is the text between 'Most recent ' and '(s)')
      '<start> - <end>'         when it contains 'until'
    A statement with both 'Most recent' and 'until' contributes two entries, embargo first. An entry is '?' when the four-digit year or embargo period it needs cannot be found.

    Returns a one-element pandas Series whose single value is the list of entries.
    '''
    ranges = []
    # An empty coverage dict simply yields no iterations
    for statement in row['coverage'].values():
        # Only use statements that have data on the date coverage
        if 'Available from' not in statement:
            continue

        # Search for the start year once; a failed match is checked
        # explicitly instead of catching the AttributeError from None.group
        start = re.search(r"Available from (\d{4})", statement)
        has_embargo = 'Most recent' in statement
        has_end = 'until' in statement

        # For coverage statements to the present
        if not (has_embargo or has_end):
            ranges.append(start.group(1) + ' - present' if start else '?')

        # For coverage statements with an embargo
        if has_embargo:
            embargo = re.search(r'Most recent (.*?)\(s\)', statement)
            if start and embargo:
                ranges.append(start.group(1) + ' - ' + embargo.group(1) + ' ago')
            else:
                ranges.append('?')

        # For coverage statements that aren't current to the present
        if has_end:
            end = re.search(r"until (\d{4})", statement)
            if start and end:
                ranges.append(start.group(1) + ' - ' + end.group(1))
            else:
                ranges.append('?')

    return pd.Series([ranges])