-
Notifications
You must be signed in to change notification settings - Fork 0
/
publications_query.py
217 lines (161 loc) · 6.34 KB
/
publications_query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import collections
import datetime
import io
import json
import os
import re
import smtplib
import subprocess
import xlsxwriter
from email import encoders
from email.message import Message
from email.mime.base import MIMEBase
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import config
from ads_queries import ADSQueries
def ignore_tmp_bibcodes(bibcodes):
    """Drop the temporary bibcodes from a collection of bibcodes.

    A bibcode is considered temporary if it contains the substring 'tmp'.

    Params:
    -------
    bibcodes: list of str
        Bibcodes to filter.

    Returns:
    --------
    list of str
        The bibcodes which are not temporary, in their original order.
    """

    permanent = []
    for code in bibcodes:
        if 'tmp' in code:
            # temporary bibcode, skip it
            continue
        permanent.append(code)

    return permanent
def previously_found_bibcodes():
    """Read the bibcodes recorded by previous searches.

    The bibcodes are stored one per line in the file whose path is given by PREVIOUS_BIBCODES_FILE in the
    configuration. The file is created first if it doesn't exist yet; an exception is raised if it cannot be created.

    Returns:
    --------
    list of str
        The lines of the file, with leading and trailing whitespace removed.
    """

    # the file must exist before it can be read
    _ensure_previously_recorded_file_exists()

    bibcodes = []
    with open(config.PREVIOUS_BIBCODES_FILE) as bibcodes_file:
        for line in bibcodes_file:
            bibcodes.append(line.strip())

    return bibcodes
def _ensure_previously_recorded_file_exists():
    """Make sure the file for storing previously found bibcodes exists.

    If there is no file at the path config.PREVIOUS_BIBCODES_FILE, an empty one is created. An IOError is raised if
    the file still doesn't exist afterwards.
    """

    path = config.PREVIOUS_BIBCODES_FILE

    if not os.path.isfile(path):
        # opening the file for writing creates it
        with open(path, 'w'):
            pass

    # all is well if the file exists (now)
    if os.path.isfile(path):
        return

    raise IOError('The file for storing previously found bibcodes could not be created: ' + path)
def publications_spreadsheet(publications, columns):
    """Generate an in-memory Excel spreadsheet from publications.

    Every publication is written to its own row, starting with the first row of the worksheet (no header row is
    added). The cells of a row contain the publication's values for the given column keys, in the order of the keys.
    An empty string is used if a publication has no value for a key.

    Params:
    -------
    publications: list of dict
        Publications.
    columns: list of str
        Keys of the publication details to include, in column order.

    Returns:
    --------
    io.BytesIO:
        Buffer with the spreadsheet content, positioned at its start.
    """

    buffer = io.BytesIO()
    workbook = xlsxwriter.Workbook(buffer)
    sheet = workbook.add_worksheet()

    for row_index, publication in enumerate(publications):
        for column_index, key in enumerate(columns):
            cell_value = publication.get(key, '')
            sheet.write(row_index, column_index, cell_value)

    # closing the workbook writes the spreadsheet to the buffer
    workbook.close()

    # let readers start at the beginning of the content
    buffer.seek(0)

    return buffer
def spreadsheet_columns():
    """Return the spreadsheet columns as an ordered dictionary.

    The keys are the keys under which the details are stored in a publication dictionary, the values are the
    corresponding human-friendly column names. The order of the items is the order of the columns.

    A publication dictionary need not contain all of the keys.

    Returns:
    --------
    collections.OrderedDict:
        Dictionary of column keys and names.
    """

    return collections.OrderedDict([
        ('record_type', 'Record Type'),
        ('publication_number', 'Doc/Publication Number'),
        ('author', 'Responsibility'),
        ('title', 'Title'),
        ('pub', 'Journal'),
        ('volume', 'Volume'),
        ('issue', 'Issue'),
        ('page', 'Page'),
        ('refereed', 'Refereed'),
        ('bibcode', 'Bibcode'),
        ('doi', 'DOI'),
        ('ads_url', 'ADS URL'),
        ('abstract', 'Abstract'),
        ('telescopes', 'Telescopes'),
        ('keywords', 'Keywords'),
    ])
def send_mails(spreadsheets, columns, smtp_host='smtp.saao.ac.za'):
    """Email the spreadsheets to the librarians.

    A single mail is sent from config.FROM_EMAIL_ADDRESS to all the addresses in config.LIBRARIAN_EMAIL_ADDRESSES.
    Its body, which is included both as plain text and as HTML, lists the spreadsheet column letters (A, B, C, ...)
    together with the corresponding column names. Every spreadsheet is added as an attachment.

    Params:
    -------
    spreadsheets: list of dict
        Spreadsheets to attach. Every dictionary must have the items 'name' (filename to use for the attachment) and
        'content' (file-like object whose read method returns the bytes of the spreadsheet).
    columns: dict
        Dictionary of column keys and human-friendly column names, in the order of the spreadsheet columns.
    smtp_host: str
        Host name of the SMTP server used for sending the mail.
    """

    # media subtype for xlsx files; the main type ('application') is passed to MIMEBase separately
    xlsx_subtype = 'vnd.openxmlformats-officedocument.spreadsheetml.sheet'

    # explain which spreadsheet column (A, B, C, ...) contains which detail
    column_explanation = '\n'.join('{letter} - {name}<br>'.format(letter=chr(ord('A') + index), name=name)
                                   for index, name in enumerate(columns.values()))

    outer = MIMEMultipart()
    outer['Subject'] = 'Publications Query Results'
    outer['To'] = ', '.join(config.LIBRARIAN_EMAIL_ADDRESSES)
    outer['From'] = config.FROM_EMAIL_ADDRESS
    outer.preamble = 'You will not see this in a MIME-aware mail reader.\n'

    # the body is sent as both plain text and HTML, so that the mail reader can choose
    body = MIMEMultipart('alternative')
    html = '\n'.join([
        '<p>Dear Librarian,</p>',
        '<p>Please find attached the results for the publications query.</p>',
        '<p>{column_explanation}</p>',
        '<p>Kind regards,</p>',
        '<p>Your Friendly Publications Query Script</p>'
    ]).format(column_explanation=column_explanation)

    # the plain text version is the HTML version with all the tags removed
    text = re.sub(r'<[^>]+>', '', html)
    body.attach(MIMEText(text, 'plain'))
    body.attach(MIMEText(html, 'html'))
    outer.attach(body)

    for spreadsheet in spreadsheets:
        # MIMEBase takes the main type and the subtype as separate arguments; passing a full media type as the subtype
        # would result in a malformed Content-Type header such as 'application/application/...'
        attachment = MIMEBase('application', xlsx_subtype)
        attachment.set_payload(spreadsheet['content'].read())
        encoders.encode_base64(attachment)
        attachment.add_header('Content-Disposition', 'attachment', filename=spreadsheet['name'])
        outer.attach(attachment)

    with smtplib.SMTP(smtp_host) as smtp:
        smtp.sendmail(config.FROM_EMAIL_ADDRESS, config.LIBRARIAN_EMAIL_ADDRESSES, outer.as_string())
def run_pub(start, end):
    """Query ADS for publications and email the results as a spreadsheet.

    Publications are queried by keyword (config.KEYWORDS) and by author (the keys of config.AUTHORS). The results
    of both queries are merged, sorted by bibcode, supplemented with the ADS URL and the refereed status, and sent to
    the librarians as the spreadsheet 'all.xlsx'.

    Params:
    -------
    start:
        Start of the date range to query; passed as from_date to ADSQueries (presumably a datetime.date).
    end:
        End of the date range to query; passed as to_date to ADSQueries (presumably a datetime.date).
    """

    queries = ADSQueries(from_date=start, to_date=end)
    by_keywords = queries.by_keywords(config.KEYWORDS)
    by_authors = queries.by_authors(config.AUTHORS.keys())
    # NOTE: the query by affiliation (config.AFFILIATIONS) is currently disabled

    # merge the results; if a bibcode is found by both queries, the record of the author query is used...
    found = {**by_keywords, **by_authors}

    # ... so make sure the keywords of the keyword query are present
    for bibcode, publication in by_keywords.items():
        found[bibcode]['keywords'] = publication['keywords']

    # now that we have collected everything, we can flatten our map to a list
    publications = sorted(found.values(), key=lambda p: p['bibcode'])

    list_value_columns = ('author', 'title', 'doi', 'keywords', 'page')
    for p in publications:
        # add URL to ADS page
        p['ads_url'] = 'https://ui.adsabs.harvard.edu/#abs/{0}/abstract'.format(p['bibcode'])

        # add refereed status
        p['refereed'] = 'REFEREED' in p['property']

        # make some content more amenable to humans and xlsxwriter alike; only real lists are joined, as joining a
        # string would separate its characters, and empty lists are turned into an empty string
        for c in list_value_columns:
            value = p.get(c)
            if isinstance(value, (list, tuple)):
                p[c] = ', '.join(str(item) for item in value)

    columns = spreadsheet_columns()
    spreadsheet = publications_spreadsheet(publications, columns.keys())
    send_mails([
        dict(name='all.xlsx', content=spreadsheet)
    ], columns)
#gitt