"""
Simple client for fetching latest activity in a chat room
"""
from __future__ import annotations
from datetime import datetime, timedelta
from time import sleep
import platform
import logging
from typing import Optional, Union
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
# For debugging issue #103
requests.packages.urllib3.add_stderr_logger()


class TranscriptException(Exception):
    """
    Base class for transcript errors
    """
    ...


class TranscriptFrozenDeletedException(TranscriptException):
    """
    Exception thrown when a room is frozen or deleted
    """
    ...


class SloshyClient:
    def __init__(self):
        session = requests.Session()
        session.headers['User-Agent'] = self._get_ua()
        session.mount(
            "https://", HTTPAdapter(
                max_retries=Retry(
                    total=5,
                    backoff_factor=1,
                    status_forcelist=[429])))
        self.session = session

    def _get_ua(self):
        "Put this in a separate method so that subclasses can easily override"
        return "SloshyBot/0.1 (+%s) Python/%s Requests/%s" % (
            "https://github.com/tripleee/sloshy",
            platform.python_version(),
            requests.__version__)

    def _get_requests_soup(self, url: str) -> BeautifulSoup:
        """
        Simple wrapper to set the User-Agent: header correctly and fetch
        a page, then return the BeautifulSoup parse of the result.

        In the case of a 429 error, the session's Retry adapter sleeps
        for a bit and tries again.
        """
        result = self.session.get(url)
        result.raise_for_status()
        return BeautifulSoup(result.text, 'html.parser')
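

# A minimal sketch of the override hook described in _get_ua's docstring:
# a subclass only needs to redefine that one method to send a different
# User-Agent. The bot name and contact URL below are made up for
# illustration; they are not part of this project.
#
#     class MyClient(SloshyClient):
#         def _get_ua(self):
#             return "MyBot/0.1 (+https://example.com/mybot)"
#
#     soup = MyClient()._get_requests_soup("https://example.com/")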


class Transcript(SloshyClient):
    """
    Simple wrapper for fetching the transcript of a room.
    """
    def fetch(
        self, server: str, room: int,
        fallback_sleep: int = 1
    ) -> Iterator[tuple[BeautifulSoup, str]]:
        """
        Generator to retrieve increasingly old room transcripts from server
        and yield a tuple with the contents as a BeautifulSoup object, and
        the URL it was fetched from.

        Optionally sleep before fetching an older page; the fallback_sleep
        integer argument controls how long to sleep.
        """
        url = "https://%s/transcript/%i" % (server, room)
        while True:
            logging.info('Fetching %s', url)
            soup = self._get_requests_soup(url)
            yield soup, url
            trans = soup.body.find("div", {"id": "transcript"})
            assert trans is not None
            # assert trans.text.strip() == "no messages today"
            main = soup.body.find("div", {"id": "main"})
            prev = main.find("a", {"rel": "prev"})
            if not prev:
                break
            url = 'https://%s%s' % (server, prev['href'])
            logging.info('Fetch continues after sleep at %s', url)
            sleep(fallback_sleep)
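
    # Illustrative (not executed) use of the fetch generator: walk back
    # through at most three pages of a room's transcript and print their
    # URLs. Room 6 on chat.stackoverflow.com is borrowed from main() below.
    #
    #     fetcher = Transcript()
    #     for pagenum, (soup, url) in enumerate(
    #             fetcher.fetch("chat.stackoverflow.com", 6)):
    #         print(pagenum, url, soup.title.string)
    #         if pagenum >= 2:
    #             break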

    def messages(self, server: str, room: int) -> Iterator[dict]:
        """
        Generator to retrieve increasingly older messages from the room's
        transcript.

        Yield a dict with a representation of the extracted message:

        {'server': server, 'room': room, 'url': url, 'when': datetime,
         'user': {'name': str, 'id': int}, 'msg': str, 'link': str}

        The 'msg' member is the actual message text; the 'link' member is
        meant for the message's permalink (currently it simply duplicates
        'url'). The 'server' and 'room' members are copied from the input.
        The 'url' is the address of the transcript page we fetched and
        scraped.
        """
        for soup, url in self.fetch(server, room):
            title = soup.title.string
            assert ' - ' in title
            # Title sometimes contains ' (page 1 of 2)' after the date
            title = title.split(' (page ')[0]
            datestr = title.rsplit(' - ', 1)[-1].strip()
            try:
                date = datetime.strptime(datestr, '%Y-%m-%d')
            except ValueError:
                logging.warning("Failed to parse date %r", datestr)
                raise
            monologue = soup.body.find_all("div", {"class": "monologue"})
            if not monologue:
                logging.warning("No monologue found in %s", url)
                logging.debug(soup.body)
                continue
            for message in reversed(monologue):
                when = message.find("div", {"class": "timestamp"})
                time = None
                if when:
                    try:
                        time = datetime.strptime(
                            when.text, "%I:%M %p").time()
                    except ValueError:
                        logging.warning(
                            "Failed to parse time %r", when.text)
                if time is None:
                    time = datetime(1970, 1, 1).time()
                userdiv = message.find("div", {"class": "username"})
                try:
                    # <div class="username">
                    #   <a href="/users/3735529/smokedetector"
                    #      title="SmokeDetector">SmokeDetector</a></div>
                    user = userdiv.a['href'].split('/')
                    username = user[-1]
                    userid = int(user[-2])
                except TypeError:
                    # <div class="username">user12716323</div>
                    username = userdiv.text
                    userid = 0  # fallback when no numeric id is available
                    if username.startswith('user'):
                        try:
                            userid = int(username[4:])
                        except ValueError:
                            userid = 0
                result = {
                    "server": server,
                    "room": room,
                    "url": url,
                    "when": datetime.combine(date, time),
                    "user": {
                        "name": username,
                        "id": userid
                    },
                    "msg": message.find(
                        "div", {"class": "content"}).text.strip(),
                    "link": url
                }
                logging.debug("yielding message %r", result)
                yield result
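
    # Illustrative (not executed): take the five most recent messages from
    # a room and print who said what when; itertools.islice caps the
    # otherwise unbounded generator.
    #
    #     from itertools import islice
    #     fetcher = Transcript()
    #     for msg in islice(
    #             fetcher.messages("chat.stackoverflow.com", 6), 5):
    #         print(msg["when"], msg["user"]["name"], msg["msg"])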

    def check_frozen_or_deleted(self, server: str, room: int) -> bool:
        """
        Raise an exception if the room's info page says it is frozen or
        deleted; otherwise return False.
        """
        # https://chat.stackexchange.com/rooms/info/140197/hnq-operations-research?tab=feeds
        # https://chat.stackexchange.com/rooms/info/111121/modeling-questions-related-to-chemistry?tab=feeds
        url = "https://%s/rooms/info/%i/?tab=feeds" % (server, room)
        soup = self._get_requests_soup(url)
        if 'Currently no feeds are being posted into this room.' in soup.text:
            return False
        for candidate in ('Because this room is deleted, no feeds are'
                          ' being posted into this room.',
                          'Because this room is frozen, no feeds are'
                          ' being posted into this room.'):
            if candidate in soup.text:
                raise TranscriptFrozenDeletedException(candidate)
        return False
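
    # Illustrative (not executed): probing a room's status directly. A
    # False return means the room looks usable; a frozen or deleted room
    # raises TranscriptFrozenDeletedException instead. The room id is one
    # of the examples linked in the comments above.
    #
    #     fetcher = Transcript()
    #     try:
    #         fetcher.check_frozen_or_deleted(
    #             "chat.stackexchange.com", 140197)
    #     except TranscriptFrozenDeletedException as exc:
    #         print("room is unavailable:", exc)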

    def usercount(
        self,
        room: int,
        server: str,
        userlimit: int = 1,
        messagelimit: int = 1,
        skip_system_messages: bool = True,
        sloshy_id: int | None = None,
        sloshy_interval: timedelta = timedelta(days=7),
        sloshy_max_interval: timedelta = timedelta(days=15)
    ) -> list[dict] | None:
        """
        Fetch the latest messages from room on server. Stop when the
        unique user count reaches userlimit, though always return at least
        as many messages as specified by messagelimit, if available.

        Optionally, skip system messages (where the user id < 0).
        If sloshy_id is not None, it should be the id of the Sloshy user
        as an int; the fetch is then considered successful as soon as two
        adjacent messages from this user are found with more than
        sloshy_interval (default 7 days) between them, but less than
        sloshy_max_interval (default 15 days).

        Return the sequence of messages as a list of dicts, newest first,
        each as described in the `messages` method's docstring; or, if
        the conditions could not be satisfied, return None.
        """
        assert isinstance(room, int)
        self.check_frozen_or_deleted(server, room)
        messages: list[dict] = []
        # Logically previous, but chronologically next, as we are
        # traversing the transcript backwards
        next_sloshy_msg = None
        users = set()
        for message in self.messages(server, room):
            userid = message['user']['id']
            if userid < 0 and skip_system_messages:
                continue
            messages.append(message)
            if sloshy_id is not None and userid == sloshy_id:
                if next_sloshy_msg is not None:
                    sloshy_delta = next_sloshy_msg["when"] - message["when"]
                    logging.debug(
                        "Next Sloshy message at %s",
                        next_sloshy_msg["when"])
                    logging.debug(
                        "Current Sloshy message at %s", message["when"])
                    if sloshy_interval < sloshy_delta < sloshy_max_interval:
                        logging.info(
                            "Room %i: %s: delta to next Sloshy message is %s",
                            room, message["when"], sloshy_delta)
                        return messages
                    else:
                        logging.debug("Delta %s", sloshy_delta)
                next_sloshy_msg = message
            else:
                next_sloshy_msg = None
            if userid not in users:
                logging.debug("room %i: found user %i", room, userid)
                users.add(userid)
            if len(users) >= userlimit and len(messages) >= messagelimit:
                logging.info(
                    "went back to %s, got %i messages, %i users",
                    message["url"], len(messages), len(users))
                return messages
            logging.debug(
                "room %i: users %s, %i/%i messages", room, users,
                len(messages), messagelimit)
        logging.warning(
            "went back to %s, only found %i messages, %i users",
            message["url"], len(messages), len(users))
        return None
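
    # Illustrative (not executed): require messages from at least three
    # distinct users, but no fewer than five messages in total. A None
    # result means the whole transcript was exhausted without meeting the
    # limits. The room and server are borrowed from main() below.
    #
    #     fetcher = Transcript()
    #     result = fetcher.usercount(
    #         6, "chat.stackoverflow.com", userlimit=3, messagelimit=5)
    #     if result is not None:
    #         print("newest:", result[0]["when"],
    #               "oldest:", result[-1]["when"])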

    def latest(
        self,
        room: int,
        server: str,
        sloshy_id: int | None = None
    ) -> dict:
        """
        Fetch the latest message from room on server. Return a dict like
        the ones produced by the `messages` method.

        If sloshy_id is not None, it should be the id of the Sloshy user,
        and it will be passed on to self.usercount().

        Skip any negative user ids, as those are feed messages which do
        not count as actual activity.
        """
        kwargs = {"userlimit": 1, "messagelimit": 1}
        if sloshy_id is not None:
            logging.debug("latest: room %i; sloshy_id %i", room, sloshy_id)
            kwargs["sloshy_id"] = sloshy_id
        return self.usercount(room, server, **kwargs)[0]
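
    # Illustrative (not executed): latest() is the simple entry point used
    # by main() below; it returns just the newest qualifying message.
    #
    #     fetcher = Transcript()
    #     info = fetcher.latest(6, "chat.stackoverflow.com")
    #     print(info["when"], info["user"]["name"], info["msg"])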

    def search(
        self, server: str, room: int, user: int, phrase: str
    ) -> Optional[str]:
        """
        Search room on server for posts by user containing phrase.
        Return the URL where the newest match was found, or None if there
        was no match.
        """
        logging.info(
            'Searching for %s by user %i in room %s:%i',
            phrase, user, server, room)
        url = 'https://%s/search?q=%s&user=%i&room=%i' % (
            server, requests.utils.quote(phrase, safe=''), user, room)
        found = self._get_requests_soup(url)
        content = found.body.find("div", {"id": "content"})
        if '\n0 messages found' in content.text:
            return None
        messages = content.find("div", {"class": "messages"})
        return 'https://%s%s' % (server, messages.find("a")['href'])
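
    # Illustrative (not executed): look for a phrase posted by a specific
    # user. The user id 3735529 (SmokeDetector) is borrowed from the markup
    # example in messages() above; the phrase is made up.
    #
    #     fetcher = Transcript()
    #     link = fetcher.search(
    #         "chat.stackoverflow.com", 6, 3735529, "example phrase")
    #     print(link or "no match")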


class RepScrapeException(Exception):
    """
    Exception for RepScrape failure
    """
    ...


class RepScrape(SloshyClient):
    def __init__(self, user: Union[int, str]):
        super().__init__()
        url = f"https://stackexchange.com/users/{user}/sloshy?tab=accounts"
        soup = self._get_requests_soup(url)
        accounts = soup.body.find_all("div", {"class": "account-container"})
        sites = {}
        sitemax = {}
        for account in accounts:
            site = account.find(
                "div", {"class": "account-site"}).h2.a['href'].split('/')[2]
            score = int(
                account.find("span", {"class": "account-number"}).text)
            sites[site] = score
            logging.debug("found site %s: score %i", site, score)
            topsite = ".".join(site.split(".")[-2:])
            if topsite not in sitemax or sitemax[topsite] < score:
                sitemax[topsite] = score
                logging.debug("max for %s: %i", topsite, score)
        # Store attributes for debugging
        self.soup = soup
        self.sitemax = sitemax
        self.sites = sites
        for site in sitemax:
            if sitemax[site] < 30:
                raise RepScrapeException(
                    f"site {site} has score {sitemax[site]} < 30")


def main():
    fetcher = Transcript()
    for room in (6, 291, 109494, 228186):  # python, rebol, friendly bin, git
        info = fetcher.latest(room, "chat.stackoverflow.com")
        print(info)
    for room in (109983, 117114):
        info = fetcher.latest(room, "chat.stackexchange.com")
        print(info)
    # Really slow, will scroll all the way back to 2016
    # for room in (111347,):  # SObotics
    for room in (233626,):
        for message in fetcher.messages("chat.stackoverflow.com", room):
            print(message)


if __name__ == '__main__':
    main()