# utils.py
"""
general utils for unravel project
- web get
- text parse
- hyperlink traverse
- word similarity
- text reconstruct
- job metadata
"""
import os
import logging
import logging.config

import redis
import HTMLParser
import wikipedia
import nltk
from nltk import tokenize

try:
    import cPickle as pickle
except ImportError:
    import pickle

logging.config.fileConfig('log.conf')
logger = logging.getLogger(__name__)
nltk.download('punkt')  # sentence tokenizer models used by split_raw_text

def gen_disp_text(input_text=None):
    """
    generate html safe text for display
    """
    processed_text, siteurl = generate_unravelled_text(input_text=input_text, full_summary=[], prevlinked=[], qdepth=2)
    # disp_text = processed_text.encode('ascii', 'xmlcharrefreplace')
    html_parser = HTMLParser.HTMLParser()
    disp_text = ""
    for row in processed_text:
        disp_text += html_parser.unescape(row)
        # disp_text += "<br />"
    return disp_text, siteurl

def generate_unravelled_text(input_text=None, qdepth=2, similarity=0.75, alength='summary', full_summary=None, prevlinked=None):
    """
    generate the full unravelled text
    params:
    - input_text, the topic to be searched for
    - qdepth, the number of links to follow down the article tree
    - similarity, the cosine similarity minimum for sub-topic inclusion
    - alength, the length of article to be returned - ["full","summary"] (currently unused)
    - full_summary and prevlinked, accumulators threaded through the recursive calls
    """
    # avoid mutable default arguments so state does not leak between calls
    if full_summary is None:
        full_summary = []
    if prevlinked is None:
        prevlinked = []
    logger.info("beginning unravel process for %s, qdepth=%d" % (input_text, qdepth))
    topicpage = webget(topic=input_text)
    if topicpage is not None:
        siteurl = topicpage.url
        prevlinked.append(input_text.lower())
        topicsummary = topicpage.summary
        links = topicpage.links
        for sentence in split_raw_text(topicsummary):
            full_summary.extend([sentence, " "])
            # tmp_summ += " "
            current_depth = qdepth - 1
            if current_depth <= 0:
                return [sentence, "<br /><br />"], siteurl  # this prevents whole summaries from being returned
            else:
                for link in list(links):  # iterate over a copy so links.remove() below is safe
                    if link.lower() in sentence.lower() and link.lower() not in prevlinked:
                        # doesn't get non identical link text, link value
                        if word_distance_check(link, input_text, similarity):
                            prevlinked.append(link.lower())
                            full_summary.extend(generate_unravelled_text(input_text=link, qdepth=current_depth, full_summary=[], prevlinked=prevlinked)[0])
                            links.remove(link)
        return full_summary, siteurl
    else:
        return full_summary, None

def webget(topic=None):
    """
    get the relevant wiki summary for topic or URL
    """
    logger.debug("webget %s commencing" % topic)
    try:
        if topic is not None:
            cg = cache_get(topic)
            if cg is None:
                logger.info("failed to find cache for %s" % topic)
                topicpage = wikipedia.page(title=topic, preload=True)  # preload causes open issue keyerr extlinks
                logger.info("setting cache for %s" % topic)
                cache_set(topic, topicpage)
                return topicpage
            else:
                logger.info("found cache for %s" % topic)
                return cg
    except wikipedia.exceptions.DisambiguationError as err:
        logger.info("Wikipedia disambig error: %s" % err)
        # might eventually handle this better
        return None
    except wikipedia.exceptions.WikipediaException as err:
        logger.critical("Wikipedia error: %s" % err)
        return None

def split_raw_text(raw):
    """
    split raw text into sentences
    """
    return tokenize.sent_tokenize(raw)

def word_distance_check(topic, testword, similarity):
    """
    perform a word2vec cosine similarity check of testword against topic;
    return False when the similarity falls below the given threshold
    currently a stub that always returns True
    """
    # connect to API service
    return True
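

# a minimal sketch (not wired into the project) of how the check above could be
# implemented with gensim word2vec; the gensim dependency and the "word2vec.bin"
# model path are assumptions, not part of this repo
def word2vec_similarity_sketch(topic, testword, similarity, model_path="word2vec.bin"):
    """
    illustrative only: load pretrained word2vec vectors and compare the two
    words, returning True when cosine similarity meets the threshold
    """
    from gensim.models import KeyedVectors  # assumed dependency
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)  # slow; cache the model in practice
    try:
        return model.similarity(topic.lower(), testword.lower()) >= similarity
    except KeyError:
        # out-of-vocabulary (or multi-word) terms fail the check
        return False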

def redis_connect():
    """
    look for remote or local redis cache and return connection object if found
    """
    if os.environ.get("REDIS_URL", None) is not None:
        return redis.from_url(os.environ.get("REDIS_URL"))
    elif os.environ.get("REDIS_PORT", None) is not None:  # easy check for docker-compose env
        return redis.StrictRedis(host='redis', port=os.environ['REDIS_PORT'])
    else:
        return None

def cache_get(topic):
    """
    checks the cache for topic and returns it if found
    """
    try:
        r = redis_connect()
        if r is not None:
            logger.info("performing cache lookup for %s" % topic)
            topicget = r.get(topic)
            if topicget is not None:
                return pickle.loads(topicget)
    except redis.exceptions.ConnectionError as err:
        logger.critical("Redis ConnectionError: %s" % err)
    return None

def cache_set(topic, topicpage):
    """
    saves topic data and links to cache
    """
    try:
        r = redis_connect()
        if r is not None:
            if r.set(name=topic, value=pickle.dumps(topicpage), ex=21600):  # 6hr expiry
                logger.info('successfully saved %s to cache' % topic)
        else:
            return None
    except redis.exceptions.ConnectionError as err:
        logger.critical("Redis ConnectionError: %s" % err)
        return None

def stats():
    """
    stats for this job
    """
    return None
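

# illustrative usage only; "Alan Turing" is an arbitrary example topic, and this
# assumes network access to Wikipedia plus an optional redis cache via REDIS_URL
if __name__ == "__main__":
    display_text, source_url = gen_disp_text(input_text="Alan Turing")
    logger.info("unravelled %s into %d characters of display text" % (source_url, len(display_text)))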