This repository has been archived by the owner on Apr 20, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
wsk.py
785 lines (696 loc) · 28.8 KB
/
wsk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
from pymongo import MongoClient
from bs4 import BeautifulSoup, element
from datetime import datetime, timedelta
from dateutil import parser as dateparser
from random import random
import base64
import calendar
import copy
import json
import requests
import time
import sys
class WSK:
def __init__(self, environment='', project_id=''):
    '''
    Store the settings shared by every later request
    @param {str} environment: the WSK host name that requests are sent to
    @param {str} project_id: the project id sent along with search requests
    '''
    self.environment = environment
    self.project_id = project_id
    # populated by authenticate()
    self.auth_token = None
    # no database until set_db() is called; initialised here so that
    # save_results() can test for it instead of hitting an AttributeError
    self.db = None
    self.verbose = True
    # seconds since the epoch (UTC), stamped onto every saved result
    self.session_id = calendar.timegm(time.gmtime())
def set_db(self, dbname='wsk', uri='mongodb://localhost:27017'):
    '''
    Open a MongoDB client and keep a handle to one of its databases
    @param {str} dbname: the name of the database to select
    @param {str} uri: the mongodb uri of the server to connect to
    '''
    client = MongoClient(uri)
    self.db = client[dbname]
def get_url(self, service, protocol='http'):
    '''
    Build the endpoint url for one of the WSK services
    @param {str} service: the service endpoint to which the query will be sent
    @param {str} protocol: the url scheme to use
    @returns {str}: the fully-qualified url to which the request will be made
    '''
    pieces = [protocol, '://', self.environment, '/wsapi/v1/services/', service]
    return ''.join(pieces)
def get_headers(self, request):
    '''
    Get the headers for a query with the right content length attribute
    @param {str} request: an XML request object to be POST'ed to the WSK server
    @returns {obj}: the headers to be used in a WSK request
    '''
    # Content-Length is a byte count; the header below declares UTF-8, so
    # measure the encoded body rather than the number of characters
    body = request.encode('utf-8') if isinstance(request, str) else request
    return {
        'Host': self.environment,
        'Content-Type': 'text/xml; charset=UTF-8',
        'Content-Length': str(len(body)),
        'SOAPAction': ''
    }
def authenticate(self, username, password):
    '''
    Set the WSK's auth_token attribute by authenticating with the WSK servers.
    Prints diagnostics and exits the process if no token is found in the reply.
    @param {str} username: the user's WSK username
    @param {str} password: the user's WSK password
    @returns {str}: the security token found in the server's reply
    '''
    from xml.sax.saxutils import escape
    # credentials may contain &, < or >, which would otherwise corrupt the XML
    request = '''
<SOAP-ENV:Envelope
xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"
SOAP-ENV:encodingStyle= "http://schemas.xmlsoap.org/soap/encoding/">
<soap:Body xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<Authenticate xmlns="http://authenticate.authentication.services.v1.wsapi.lexisnexis.com">
<authId>{0}</authId>
<password>{1}</password>
</Authenticate>
</soap:Body>
</SOAP-ENV:Envelope>
'''.format(escape(str(username)), escape(str(password)))
    url = self.get_url('Authentication', protocol='https')
    response = requests.post(url=url, headers=self.get_headers(request), data=request)
    try:
        # find() returns None when the token tag is absent, so .string raises
        self.auth_token = BeautifulSoup(response.text, 'lxml').find('binarysecuritytoken').string
        return self.auth_token
    except AttributeError as e:
        print(' * Authentication failure. Please verify your credentials and environment')
        print(' * url: ', url)
        print(' * response: ', response)
        print(' * e: ', e)
        sys.exit()
##
# Browse Sources
##
def get_all_sources(self):
    '''
    Get a list of the sources available to the current account. To do so, find
    all source types, then descend down the tree of folders to find all
    sources / leaf nodes. NB: Different folders have different depths one must
    descend to find sources / leaf nodes.
    @returns {arr}: the source objects returned by browse_sources() for every
      leaf folder reached from the first top-level folder
    '''
    from collections import deque
    # populate list of sources available to account
    sources = []
    # find the top order folders
    root_folders = self.browse_sources()
    if not root_folders:
        return sources
    # use the first source grouping folder to recurse through the folder hierarchy
    pending = deque(self.browse_sources(root_folders[0]['folder_id']))
    # descend into all sub folders, queueing any newly discovered sub folders
    while pending:
        sub_folder = pending.popleft()
        print(' * fetching sources in', sub_folder)
        # some results will contain parent folders, others contain source / leaf nodes
        result = self.browse_sources(sub_folder['folder_id'])
        if not result:
            # an empty folder has neither sub folders nor sources; skip it
            continue
        if 'source_id' not in result[0]:
            pending.extend(result)
        else:
            sources += result
    return sources
def browse_sources(self, folder_id=''):
    '''
    Query for the sources to which the account has access. LexisNexis organizes
    sources into "folders". Each folder contains subfolders, which contain source
    information. To find all sources to which your account has access, one can choose
    a folder_id, find all subfolders, then find all sources within each of those
    subfolders and use the resulting set of sources.
    @param {str} folder_id: The high order folder to search for sources. If a
      folder_id is not provided, the query result will contain a list of available
      folder names and folder ids.
    @returns {obj} If the user did not provide a folder_id, obj will be a list
      of objects, where each object has name and folder_id attributes that describe
      a top-level folder.
      If the user provided a folder_id and that folder_id has children folders, obj
      will contain a list of objects, where each object has name and folder_id attributes
      that describe a subfolder within the queried folder_id.
      If the user provided a folder_id and that folder_id contains a list of sources,
      obj will contain a list of objects where each object contains source_id,
      type, name, and other metadata attributes.
    '''
    def flag(tag, child_name):
        # bool() of any non-empty string (including "false") is True, so
        # compare the tag text against the true-like spellings instead
        text = find_tag_by_name(tag, child_name).get_text()
        return text.strip().lower() in ('true', '1')

    # assemble the folder argument to be passed to the soap request
    folder_arg = '<folderId>{0}</folderId>'.format(folder_id) if folder_id else ''
    # assemble the browse source query
    request = '''
<SOAP-ENV:Envelope
xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"
SOAP-ENV:encodingStyle= "http://schemas.xmlsoap.org/soap/encoding/">
<soap:Body xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<BrowseSources xmlns="http://browsesources.source.services.v1.wsapi.lexisnexis.com">
<locale>en-US</locale>
<binarySecurityToken>{0}</binarySecurityToken>
{1}
</BrowseSources>
</soap:Body>
</SOAP-ENV:Envelope>
'''.format(self.auth_token, folder_arg)
    url = self.get_url('Source')
    response = requests.post(url=url, headers=self.get_headers(request), data=request)
    soup = BeautifulSoup(response.text, 'lxml')
    # parse out the sources identified for this query
    # nb: sources have different namespace prefixes, so match on a substring
    # of the tag name while excluding the sourceid / premiumsource children
    sources = [
        tag for tag in soup.find('sourcelist').findChildren()
        if 'source' in tag.name
        and 'sourceid' not in tag.name
        and 'premiumsource' not in tag.name
    ]
    # case where query result contains sources
    if sources:
        return [{
            'name': find_tag_by_name(tag, 'name').get_text(),
            'source_id': int(find_tag_by_name(tag, 'sourceid').get_text()),
            'type': find_tag_by_name(tag, 'type').get_text(),
            # kept as the raw tag text, as before
            'premium_source': find_tag_by_name(tag, 'premiumsource').get_text(),
            'has_index': flag(tag, 'hasindex'),
            'has_toc': flag(tag, 'hastoc'),
            'versionable': flag(tag, 'versionable'),
            'is_page_browsable': flag(tag, 'ispagebrowsable'),
        } for tag in sources]
    # case where query result contains folders
    return [{
        'name': folder.find('name').get_text(),
        'folder_id': folder.find('folderid').get_text()
    } for folder in soup.find_all('folder')]
##
# Search Sources
##
def search_sources(self, query):
    '''
    Look up sources whose name matches a partial source name
    @param: {str} query: a query for sources
    @returns: {arr}: a list of source metadata objects that match the query
    '''
    from xml.sax.saxutils import escape

    def flag(tag, child_name):
        # bool() of any non-empty string (including "false") is True, so
        # compare the tag text against the true-like spellings instead
        return tag.find(child_name).text.strip().lower() in ('true', '1')

    # source names may contain & or <, which would otherwise corrupt the XML
    request = '''
<SOAP-ENV:Envelope
xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"
SOAP-ENV:encodingStyle= "http://schemas.xmlsoap.org/soap/encoding/">
<soap:Body xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<SearchSources xmlns="http://searchsources.source.services.v1.wsapi.lexisnexis.com">
<locale>en-US</locale>
<binarySecurityToken>{0}</binarySecurityToken>
<partialSourceName>{1}</partialSourceName>
</SearchSources>
</soap:Body>
</SOAP-ENV:Envelope>
'''.format(self.auth_token, escape(str(query)))
    url = self.get_url('Source')
    response = requests.post(url=url, headers=self.get_headers(request), data=request)
    soup = BeautifulSoup(response.text, 'xml')
    sources = []
    for i in soup.find_all('source'):
        sources.append({
            'name': i.find('name').text,
            'source_id': int(i.find('sourceId').text),
            'type': i.find('type').text,
            'premium_source': flag(i, 'premiumSource'),
            'has_index': flag(i, 'hasIndex'),
            'versionable': flag(i, 'versionable'),
            'is_page_browsable': flag(i, 'isPageBrowsable'),
            'combinability': [j.text for j in i.find_all('combinability')]
        })
    return sources
##
# Get Source Details
##
def get_source_details(self, source_id):
    '''
    Request the source guides for one source and parse each of them
    @param: {int} source_id: a source id for which details are requested
    @returns: {arr}: a list of objects describing titles in the source id
    '''
    envelope = '''
<SOAP-ENV:Envelope
xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"
SOAP-ENV:encodingStyle= "http://schemas.xmlsoap.org/soap/encoding/">
<soap:Body xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<GetSourceDetails xmlns="http://getsourcedetails.source.services.v1.wsapi.lexisnexis.com">
<binarySecurityToken>{0}</binarySecurityToken>
<sourceId>{1}</sourceId>
<includeSourceElement>true</includeSourceElement>
</GetSourceDetails>
</soap:Body>
</SOAP-ENV:Envelope>
'''.format(self.auth_token, source_id)
    reply = requests.post(
        url=self.get_url('Source'),
        headers=self.get_headers(envelope),
        data=envelope)
    guide_list = BeautifulSoup(reply.text, 'lxml').find('sourceguidelist')
    # each <sourceguide> is handed to parse_source_details() in document order
    return [self.parse_source_details(guide) for guide in guide_list.find_all('sourceguide')]
def parse_source_details(self, soup):
    '''
    Decode one base64 source guide and pull out its descriptive sections
    @param: {BeautifulSoup} soup: contains the sourceguide tag from a
      get_source_details() query
    @returns: {obj}: an object that details the titles in the current source
    '''
    guide = BeautifulSoup(base64.b64decode(soup.string), 'lxml')
    # NOTE(review): the set literals below are passed as the attrs argument;
    # presumably BeautifulSoup treats them as a class filter - confirm
    # the fourth <p> of the EXCLUSIONS div is the one that gets split
    excluded = guide.find('div', {'EXCLUSIONS'}).find_all('p')[3]
    details = {}
    details['source_name'] = guide.find('div', {'class': 'PUBLICATION-NAME'}).text
    details['file_name'] = guide.find('div', {'class': 'FILE-NAME'}).text
    details['content_summary'] = guide.find('div', {'class': 'CONTENT-SUMMARY'}).text
    details['full_text'] = self.split_on_br(guide.find('div', {'FULL-TEXT'}))
    details['selected_text'] = self.split_on_br(guide.find('div', {'SELECTED-TEXT'}))
    details['also_contains'] = self.split_on_br(guide.find('div', {'ALSO-CONTAINS'}))
    details['exclusions'] = self.split_on_br(excluded)
    return details
def split_on_br(self, soup):
    '''
    Collect the children of a tag, leaving out the <br/> separators
    @param: {BeautifulSoup}: contains a list of elements separated by <br/> tags
    @returns: {arr}: a list of the elements in the soup
    '''
    pieces = []
    for node in soup.contents:
        if getattr(node, 'name', None) == 'br':
            continue
        # tags contribute their .string; anything else is kept as it is
        pieces.append(node.string if type(node) is element.Tag else node)
    return pieces
##
# Search Method
##
def search(self,
        query,
        source_id,
        get_text=True,
        start_date='2017-12-01',
        end_date='2017-12-02',
        return_results=False,
        save_results=True,
        yield_results=False,
        time_delta=30):
    '''
    Run a full query for the user, fetching all doc metadata and content.
    The date range is covered by consecutive windows of `time_delta` days and
    each window is paginated ten results at a time. This method is a
    generator, so nothing is requested until it is iterated.
    @param: {str} query: the user's document query phrase
    @param: {int} source_id: the source id to which queries will be addressed
    @param: {bool} get_text: fetch full text content for each match
    @param: {str} start_date: the starting query date in string format
    @param: {str} end_date: the ending query date in string format
    @param: {bool} return_results: after the last request, yield one list
      holding every match that was fetched
    @param: {bool} save_results: save matches to mongo
    @param: {bool} yield_results: yield the list of matches of each page as
      soon as it has been fetched
    @param: {int} time_delta: time stride in days
    @returns: {generator} yields lists of search result objects
    '''
    user_results = []  # results to return to user
    per_page = 10  # results per page
    start_date, end_date = self.get_search_dates(start_date, end_date)
    # a stride below one day would never move the window forward
    time_delta = max(1, time_delta)
    window_start = start_date
    # never query past the end date the caller asked for
    window_end = min(window_start + timedelta(days=time_delta), end_date)
    # query_begin and query_end mark the `begin` and `end` XML values for a query
    query_begin = 1
    query_end = per_page
    searching = start_date <= end_date
    while searching:
        query_result = self.run_search(query, source_id, begin=query_begin,
            end=query_end, start_date=self.date_to_string(window_start),
            end_date=self.date_to_string(window_end),
            save_results=save_results, get_text=get_text)
        # the request failed: retry with a shorter stride, or give up
        if query_result['status_code'] != 200:
            if time_delta <= 1:
                print(' * Abort!')
                break
            time_delta -= 1
            # only shrink the current window while nothing from it has been
            # emitted yet, otherwise earlier pages would be fetched twice
            if query_begin == 1:
                window_end = min(window_start + timedelta(days=time_delta), end_date)
            continue
        # only append to results in RAM if necessary
        if return_results: user_results += query_result['results']
        if yield_results: yield query_result['results']
        total_matches = query_result['total_matches']
        # continue paginating over responses for the current date range
        if query_end < total_matches:
            query_begin += per_page
            query_end += per_page
            continue
        # pagination is done; stop once the window reached the end date
        if window_end >= end_date:
            break
        # slide the window forward exactly once, starting where the previous
        # one ended so that no day falls between two windows
        window_start = window_end
        window_end = min(window_start + timedelta(days=time_delta), end_date)
        query_begin = 1
        query_end = per_page
        # sparse windows get a longer stride from the following window onward
        if total_matches < (per_page / 2): time_delta += 1
    if return_results:
        yield user_results
def run_search(self,
        query,
        source_id,
        begin=1,
        end=10,
        start_date='2017-12-01',
        end_date='2017-12-02',
        save_results=True,
        get_text=True):
    '''
    Method that actually submits search requests. Called from self.search(),
    which controls the logic that constructs the individual searches
    @param: {str} query: the user's document query phrase
    @param: {int} source_id: the source id to which queries will be addressed
    @param: {int} begin: the starting result number to return
    @param: {int} end: the ending result number to return
    @param: {str} start_date: the starting query date in string format
    @param: {str} end_date: the ending query date in string format
    @param: {bool} save_results: save matches to mongo
    @param: {bool} get_text: fetch full text content for each match
    @returns: {obj} an object with status_code, total_matches and results keys
    '''
    from xml.sax.saxutils import escape
    print(' * querying for', query, source_id, begin, end, start_date, end_date)
    # the query phrase may contain &, < or >, which would otherwise corrupt the XML
    request = '''
<SOAP-ENV:Envelope
xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"
SOAP-ENV:encodingStyle= "http://schemas.xmlsoap.org/soap/encoding/">
<soap:Body xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<Search xmlns="http://search.search.services.v1.wsapi.lexisnexis.com">
<binarySecurityToken>{0}</binarySecurityToken>
<sourceInformation>
<sourceIdList xmlns="http://common.search.services.v1.wsapi.lexisnexis.com">
<sourceId xmlns="http://common.services.v1.wsapi.lexisnexis.com">{1}</sourceId>
</sourceIdList>
</sourceInformation>
<query>{2}</query>
<projectId>{3}</projectId>
<searchOptions>
<sortOrder xmlns="http://common.search.services.v1.wsapi.lexisnexis.com">Date</sortOrder>
<dateRestriction xmlns="http://common.search.services.v1.wsapi.lexisnexis.com">
<startDate>{4}</startDate>
<endDate>{5}</endDate>
</dateRestriction>
</searchOptions>
<retrievalOptions>
<documentView xmlns="http://result.common.services.v1.wsapi.lexisnexis.com">Cite</documentView>
<documentMarkup xmlns="http://result.common.services.v1.wsapi.lexisnexis.com">Display</documentMarkup>
<documentRange xmlns="http://result.common.services.v1.wsapi.lexisnexis.com">
<begin>{6}</begin>
<end>{7}</end>
</documentRange>
</retrievalOptions>
</Search>
</soap:Body>
</SOAP-ENV:Envelope>
'''.format(self.auth_token, source_id, escape(str(query)), self.project_id,
        start_date, end_date, begin, end)
    url = self.get_url('Search')
    response = requests.post(url=url, headers=self.get_headers(request), data=request)
    soup = BeautifulSoup(response.text, 'lxml')
    result_packet = {
        'status_code': response.status_code,
        'total_matches': 0,
        'results': [],
    }
    try:
        result_count_tag = find_tag_by_name(soup, 'documentsfound')
        result_packet['total_matches'] = int(result_count_tag.get_text())
    except (AttributeError, ValueError):
        # no count tag in the reply, or its text is not a number
        result_packet['total_matches'] = 0
    # nothing to parse when there are no matches or the request failed
    if (result_packet['total_matches'] == 0) or (result_packet['status_code'] != 200):
        return result_packet
    result_packet['results'] = self.get_documents(soup, get_text)
    if save_results: self.save_results(result_packet['results'])
    return result_packet
def save_results(self, results):
    '''
    Save all search results to the database. Each record is copied and
    stamped with the current session_id and project_id before insertion;
    the caller's objects are left untouched.
    @param: {arr} results: a list of search result objects
    @raises: {Exception} if no database has been configured
    '''
    # compare with None explicitly: the attribute may not exist before
    # set_db() is called, and database handles may not support truth testing
    if getattr(self, 'db', None) is None:
        raise Exception('Please call set_db() before saving records')
    if not results: return
    composed_results = []
    for record in copy.deepcopy(results):
        record['session_id'] = self.session_id
        record['project_id'] = self.project_id
        composed_results.append(record)
    self.db.results.insert_many(composed_results)
def get_search_dates(self, start_date, end_date):
    '''
    Convert both ends of a query date range with string_to_date()
    @param {str} start_date: the starting date for the query: '2017-12-01'
    @param {str} end_date: the ending date for the query: '2017-12-02'
    @returns datetime, datetime: the start and end dates as datetime objects
    '''
    first_day = self.string_to_date(start_date)
    last_day = self.string_to_date(end_date)
    return first_day, last_day
def string_to_date(self, string_date):
    '''
    Parse a dash separated year-month-day string
    @param: {str} string_date: a date in string format: '2017-12-01'
    @returns: {datetime}: the input date in datetime format
    '''
    # exactly three integer fields are expected; anything else raises ValueError
    year, month, day = map(int, string_date.split('-'))
    return datetime(year, month, day)
def date_to_string(self, datetime_date):
    '''
    Format a datetime as a dash separated year-month-day string
    @param: {datetime}: a datetime object
    @returns: {str}: the input datetime in string format: 'YYYY-MM-DD'
    '''
    date_format = '%Y-%m-%d'
    return datetime_date.strftime(date_format)
def get_documents(self, soup, get_text=True):
    '''
    Turn every document container in a search response into a metadata object.
    Containers that cannot be processed are reported on stdout and left out.
    @param: {BeautifulSoup}: the result of a search() query
    @param: {bool} get_text: fetch full text content for each match
    @returns: {arr}: a list of objects, each describing a match's metadata
    '''
    # keep the individual containers, not the list element that wraps them
    containers = [
        tag for tag in soup.findChildren()
        if 'documentcontainer' in tag.name and 'documentcontainerlist' not in tag.name
    ]
    parsed = []
    for position, container in enumerate(containers):
        try:
            record = Document(container).metadata
            if get_text:
                record['full_text'] = self.get_full_text(record['doc_id'])
            parsed.append(record)
        except Exception as exc:
            print(' ! could not process doc', position, exc)
    return parsed
##
# Get Full Text Content
##
def get_full_text(self, document_id):
    '''
    Retrieve one document in the FullText view and decode its content
    @param: {int}: a document's id number
    @returns: {str}: the base64-decoded, utf8-decoded text of the <document> tag
    '''
    envelope = '''
<SOAP-ENV:Envelope
xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"
SOAP-ENV:encodingStyle= "http://schemas.xmlsoap.org/soap/encoding/">
<soap:Body xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<GetDocumentsByDocumentId xmlns="http://getdocumentsbydocumentid.retrieve.services.v1.wsapi.lexisnexis.com">
<binarySecurityToken>{0}</binarySecurityToken>
<documentIdList>
<documentId>{1}</documentId>
</documentIdList>
<retrievalOptions>
<documentView>FullText</documentView>
<documentMarkup>Display</documentMarkup>
</retrievalOptions>
</GetDocumentsByDocumentId>
</soap:Body>
</SOAP-ENV:Envelope>
'''.format(self.auth_token, document_id)
    reply = requests.post(
        url=self.get_url('Retrieval'),
        headers=self.get_headers(envelope),
        data=envelope)
    encoded = BeautifulSoup(reply.text, 'xml').document.text
    return base64.b64decode(encoded).decode('utf8')
class Document(dict):
def __init__(self, document_soup):
    '''
    Parse a document container and keep the result on the instance
    @param {BeautifulSoup} document_soup: a documentcontainer tag from a search
    '''
    # both switches are off by default and are set before the document is
    # parsed, because format_doc() and the accessors read them
    self.verbose = self.include_meta = False
    parsed = self.format_doc(document_soup)
    self.metadata = parsed
def format_doc(self, soup):
    '''
    @param {BeautifulSoup} soup: contains a document from a search() query:
      <ns1:documentcontainer>
        <ns1:documentid>02A6A252C52</ns1:documentid>
        <ns1:document>PD94bWwgdmVyc2lvbj0i</ns1:document>
      </ns1:documentcontainer>
      Here the <documentid> contains the doc's id and <document> contains a
      base64 encoded representation of the doc's metadata
    @returns: {obj}: an object with metadata attributes from the decoded doc
    '''
    formatted = {}
    decoded = base64.b64decode(soup.find('ns1:document').get_text())
    doc_soup = BeautifulSoup(decoded, 'lxml')
    # optionally copy every <meta name=... content=...> pair into the result
    if self.include_meta:
        for i in doc_soup.find_all('meta'):
            try:
                formatted[ i['name'] ] = i['content']
            except Exception as exc:
                # use .get() here: a meta tag without a name is what raised
                # in the first place, so indexing it again would raise too
                if self.verbose: print(' ! error formatting doc', i.get('name'), exc)
    formatted['doc_id'] = soup.find('ns1:documentid').get_text()
    formatted['headline'] = self.get_doc_headline(doc_soup)
    formatted['attachment_id'] = self.get_doc_attachment_id(doc_soup)
    formatted['pub'] = self.get_doc_pub(doc_soup)
    formatted['pub_date'] = self.get_doc_pub_date(doc_soup)
    formatted['length'] = self.get_doc_length(doc_soup)
    formatted['section'] = self.get_doc_section(doc_soup)
    formatted['author'] = self.get_doc_author(doc_soup)
    return formatted
##
# Document attribute accessors
##
def get_doc_headline(self, soup):
    '''
    Read the headline from the HEADLINE div, falling back to the first <h1>
    @param {BeautifulSoup} soup: the soup from a documentcontainer tag
    @returns {str} the headline from a document, or '' when none is found
    '''
    # try each candidate in turn; a missing tag or an empty string moves on
    # to the next one instead of raising or returning None
    for node in (soup.find('div', {'class': 'HEADLINE'}), soup.find('h1')):
        headline = node.string if node is not None else None
        if headline:
            return headline
    if self.verbose: print(' ! error parsing headline', 'no HEADLINE div or h1 text found')
    return ''
def get_doc_attachment_id(self, soup):
    '''
    Read the id attribute of the span whose class is attachmentId
    @param {BeautifulSoup} soup: a documentcontainer tag
    @returns {str}: the attachmentId attribute of a document, or '' on failure
    '''
    try:
        span = soup.find('span', {'class': 'attachmentId'})
        # indexing raises when the span or its id attribute is missing
        return span['id'] or ''
    except Exception as exc:
        if self.verbose: print(' ! error parsing doc_attachment', exc)
        return ''
def get_doc_pub(self, soup, default_name='No pub name'):
    '''
    Read the publication name from the PUB div, falling back to the content
    of the sourceName meta tag
    @param {BeautifulSoup} soup: a documentcontainer tag
    @param {str} default_name: the value returned when no name is found
    @returns {str}: the publication attribute of a document
    '''
    pub_node = soup.find('div', {'class': 'PUB'})
    pub = pub_node.string if pub_node is not None else None
    if pub:
        return pub
    # a missing or empty PUB div falls through to the meta tag rather than
    # raising or returning None
    meta_node = soup.find('meta', {'name': 'sourceName'})
    pub = meta_node.get('content') if meta_node is not None else None
    if pub:
        return pub
    if self.verbose: print(' ! error parsing doc_pub', 'no PUB div or sourceName meta found')
    return default_name
def get_doc_pub_date(self, soup):
    '''
    Parses different human-readable date formats dynamically,
    e.g.:
      January 3, 2017 Tuesday 5:00 PM GMT
    and returns a date string using the format,
    e.g.:
      2017-01-03T17:00:00Z
    The date is looked up in the PUB-DATE, DATE and DISPLAY-DATE divs, in
    that order. When the text cannot be parsed, trailing words are dropped
    one at a time until it can, or until nothing is left.
    NOTE(review): the parsed fields are formatted as-is and labelled with Z;
    no timezone conversion is performed - confirm this is intended.
    @param {BeautifulSoup} soup: a documentcontainer tag
    @returns {str}: the pub date attribute from a document, or '' on failure
    '''
    try:
        soup_date = None
        for class_name in ('PUB-DATE', 'DATE', 'DISPLAY-DATE'):
            soup_date = soup.find('div', {'class': class_name})
            if soup_date is not None:
                break
        date_str = soup_date.get_text()
        if self.verbose: print("date_str: ", date_str)
        words = date_str.split(' ')
        date = None
        # stop once no words remain: retrying an empty string forever would
        # never terminate
        while words:
            try:
                date = dateparser.parse(' '.join(words))
                if self.verbose: print('  parsed: ', date)
                break
            except Exception as exc:
                if self.verbose: print(' ! error parsing doc_pub_date', exc)
                words = words[:-1]
        if not date:
            return ''
        return date.strftime('%Y-%m-%dT%H:%M:%SZ')
    except Exception as exc:
        if self.verbose: print(' ! error parsing doc_pub_date', exc)
        return ''
def get_doc_length(self, soup):
    '''
    Read the LENGTH div and drop the ' words' label from its text
    @param {BeautifulSoup} soup: a documentcontainer tag
    @returns {str}: the length attribute of a document, or '' on failure
    '''
    try:
        raw_length = soup.find('div', {'class': 'LENGTH'}).string
        # a missing div or an empty string raises here and is handled below
        word_count = raw_length.replace(' words', '')
        return word_count or ''
    except Exception as exc:
        if self.verbose: print(' ! error parsing doc_length', exc)
        return ''
def get_doc_section(self, soup):
    '''
    Read the newspaper section from the SECTION div, falling back to the
    SECTION-INFO div
    @param {BeautifulSoup} soup: a documentcontainer tag
    @returns {str}: the newspaper section attribute of a document, or
      'No pub section' when neither div holds any text
    '''
    # find() returns None for a missing div, so test for it explicitly; that
    # way a missing or empty SECTION div moves on to SECTION-INFO instead of
    # depending on an exception being raised
    for class_name in ('SECTION', 'SECTION-INFO'):
        node = soup.find('div', {'class': class_name})
        section = node.string if node is not None else None
        if section:
            return section
    if self.verbose: print(' ! error parsing doc_section', 'no SECTION or SECTION-INFO text found')
    return 'No pub section'
def get_doc_author(self, soup):
    '''
    Read the BYLINE div and remove the 'By ' label from its text
    @param {BeautifulSoup} soup: a documentcontainer tag
    @returns {str}: the author attribute of a document, or '' when the
      byline is missing or empty
    '''
    try:
        author = soup.find('div', {'class': 'BYLINE'}).string
        author = author.replace('By ', '')
    except Exception as exc:
        if self.verbose: print(' ! error parsing doc_author', exc)
        return ''
    # an empty byline yields '' like every other failure, never None
    return author or ''
##
# Helpers
##
def find_tag_by_name(soup, tag_name):
    '''
    Search the children of a BeautifulSoup object for the first tag whose
    name has `tag_name` as a substring
    @param {BeautifulSoup} soup: a BeautifulSoup object
    @param {str} tag_name: the name of the tag to find
    @returns {BeautifulSoup} the first child whose name contains `tag_name`,
      or None when there is no such child
    '''
    matches = (child for child in soup.findChildren() if tag_name in child.name)
    return next(matches, None)