Skip to content

Commit

Permalink
Fix downloading JPEGs, better rate limiting
Browse files Browse the repository at this point in the history
  • Loading branch information
wolever committed Mar 6, 2019
1 parent 6e8edf3 commit 3a3612c
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 33 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
0.3.0 (2019-03-06)
* Fix downloading compressed JPEGs (see "Note 2" in README)
* Better retrying on rate limit errors

0.2.2 (2017-08-28)
* Fix a couple of crashes when initializing a fresh archive
* Download emoji
Expand Down
10 changes: 10 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ API is sometimes incorrect. Because Wayslack will not delete files when the
local size does not match the remote size, a few warnings will almost always be
generated when deleting files (and, obviously, those files won't be deleted).

**Note 2**: Slack appears to compress JPEGs, so the size check is not applied
to JPEGs. For all downloaded files, though, the ETag is used to verify that the
download is not corrupt (even if the downloaded file isn't identical to the
file originally uploaded).

For example::

$ wayslack --confirm-delete ~/.wayslack/your-archive/


__ https://stackoverflow.com/q/44742164/71522

Exporting Slack Messages to SQL (PostgreSQL)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

setup(
name="wayslack",
version="0.2.2",
version="0.3.0",
url="https://github.com/wolever/wayslack",
author="David Wolever",
author_email="[email protected]",
Expand Down
101 changes: 69 additions & 32 deletions wayslack.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import hashlib
import argparse
from Queue import Queue
from random import random
from threading import Thread
from itertools import groupby
from datetime import datetime, timedelta
Expand Down Expand Up @@ -53,6 +54,25 @@ def parse_age_str(s):

return datetime.now() - timedelta(days=count * multiplier)

def slack_retry(method, *args, **kwargs):
    """Call ``method(*args, **kwargs)``, retrying while Slack rate-limits it.

    An ``HTTPError`` whose text contains "Too Many Requests" causes a sleep
    followed by another attempt; any other exception propagates unchanged.
    There is no limit on the number of attempts.

    The sleep is the response's ``Retry-After`` value (read as whole seconds)
    multiplied by a randomized exponential backoff factor, capped at 30
    seconds but never shorter than ``Retry-After`` itself.

    Returns:
        Whatever ``method`` returns on its first successful call.
    """
    max_delay = 30
    attempt = 1
    while True:
        try:
            return method(*args, **kwargs)
        except HTTPError as e:
            if "Too Many Requests" not in str(e):
                raise
            retry_after = int(e.response.headers["Retry-After"])
            # Backoff + random jitter so concurrent requests don't all retry
            # at the same moment and spam the API again.
            delay = int(retry_after * (2 ** (attempt * random())))
            # Clamp the computed delay (the previous code passed the builtin
            # ``min`` to ``max`` and then tried to sleep on a function).
            # Keep the server-requested wait as a floor so the cap can never
            # make us retry earlier than Slack asked.
            delay = max(retry_after, min(delay, max_delay))
            if VERBOSE:
                # Single parenthesized argument: same output under the
                # Python 2 print statement this file otherwise uses.
                print("Slack reported Too Many Requests for %r (retrying in %s seconds)" % (
                    method,
                    delay,
                ))
            time.sleep(delay)
            attempt += 1


VERBOSE = False

Expand Down Expand Up @@ -355,8 +375,16 @@ def _downloader(self, item):
in res.headers
),
))
hash = hashlib.md5()
for chunk in res.iter_content(4096):
hash.update(chunk)
f.write(chunk)
if hash.hexdigest() != res.headers["etag"]:
raise Exception("Downloading %r: checksum does not match. etag %r != md5 %r\n" %(
url,
res.headers["etag"],
hash.hexdigest(),
))
self.counter += 1
print "Downloaded %s (%s left): %s" %(
self.counter,
Expand Down Expand Up @@ -387,7 +415,11 @@ def is_file_missing(self, file_obj):
if not download_path.exists():
return "does not exist", download_path
size = download_path.stat().st_size
if size != file_obj["size"]:
# Note: Slack appears to compress JPEG files, so ignore this error if
# the image is a JPEG (the integrity will be checked by the downloader
# to ensure that the file content is correct, but it may not be
# identical to the file originally uploaded).
if size != file_obj["size"] and file_obj["mimetype"] != "image/jpeg":
msg = "size does not match (actual size %s != expected size %s)" %(
size,
file_obj["size"],
Expand Down Expand Up @@ -455,18 +487,11 @@ def load_messages(self, archive):
return json.load(f)

def _get_list(self, slack, latest_ts):
    """Request this channel's history from Slack, retrying on rate limits.

    Calls ``slack.history`` through ``slack_retry`` with ``channel`` set to
    ``self.id``, ``oldest`` set to ``latest_ts`` and ``count`` set to 1000,
    and returns the result unchanged.

    The rate-limit handling lives entirely in ``slack_retry``; the fixed
    5-second inline retry loop that used to be here is gone so there is a
    single retry policy for all Slack calls.
    """
    return slack_retry(
        slack.history,
        channel=self.id,
        oldest=latest_ts,
        count=1000,
    )

def _refresh_messages(self):
latest_archive = next(self.iter_archives(reverse=True), None)
Expand Down Expand Up @@ -732,7 +757,7 @@ def iter_file_lists(self):
print "Walking backwards to find oldest file (this may take a little while)..."

while not self.status.get("oldest_file"):
resp = self.slack.files.list(
resp = slack_retry(self.slack.files.list,
ts_to=self.status.get("ts_to_oldest") or 0,
)
assert_successful(resp)
Expand All @@ -759,8 +784,8 @@ def iter_file_lists(self):
# Now start at the latest file we've seen before and walk forward!
while True:
resp = (
self.slack.files.list() if not self.status.get("ts_from_newest") else
self.slack.files.list(ts_from=self.status["ts_from_newest"])
slack_retry(self.slack.files.list) if not self.status.get("ts_from_newest") else
slack_retry(self.slack.files.list, ts_from=self.status["ts_from_newest"])
)

assert_successful(resp)
Expand Down Expand Up @@ -808,7 +833,7 @@ def delete_old_files(self, date, confirm):
def delete_file(x):
file_file, file_obj = x
try:
res = self.slack.files.delete(file_obj["id"])
res = slack_retry(self.slack.files.delete, file_obj["id"])
assert_successful(res)
except Error as e:
print "Error deleting file %r: %s" %(file_obj["id"], e.message)
Expand Down Expand Up @@ -966,23 +991,11 @@ def delete_old_files(self, confirm=False):


def args_get_archives(args):
for a in args.archive:
token, _, path = a.rpartition(":")
path = os.path.expanduser(path)
if not os.path.isdir(path):
print "Note: directory will be created: %s" %(path, )
while not token:
token = raw_input("API token for %s (see: https://api.slack.com/custom-integrations/legacy-tokens): " %(path, ))
yield {
"token": token,
"dir": path,
"name": path,
}

config_archives = []
default_config_file = os.path.expanduser("~/.wayslack/config.yaml")
config_file = (
args.config if args.config else
default_config_file if os.path.exists(default_config_file) and not args.archive else
default_config_file if os.path.exists(default_config_file) else
None
)
if config_file:
Expand All @@ -991,7 +1004,31 @@ def args_get_archives(args):
archive.setdefault("name", archive["dir"])
archive["dir"] = os.path.expanduser(archive["dir"])
archive["dir"] = os.path.join(os.path.dirname(config_file), archive["dir"])
yield archive
config_archives.append(archive)

if not args.archive:
for ca in config_archives:
yield ca
return

for a in args.archive:
token, _, path = a.rpartition(":")
path = os.path.expanduser(path)
for ca in config_archives:
if ca["dir"].rstrip("/") == path.rstrip("/"):
yield ca
break
else:
if not os.path.isdir(path):
print "Note: directory will be created: %s" %(path, )
while not token:
token = raw_input("API token for %s (see: https://api.slack.com/custom-integrations/legacy-tokens): " %(path, ))
yield {
"token": token,
"dir": path,
"name": path,
}


def main(argv=None):
global VERBOSE
Expand Down

0 comments on commit 3a3612c

Please sign in to comment.