From 72f73a0efee0aec315c4f53a80599e6cc29b9795 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 9 Nov 2019 09:10:08 -0800
Subject: [PATCH] upload improvements:

- limit pages and bookmarks to 10000
- add settings to limit bookmarks and pages separately
- include page and bookmark creation in progress bar

Previously, page/bookmark creation could take a long time but was not
reflected in the progress update.

Should fix #768; likely also webrecorder/webrecorder-player#87,
webrecorder/webrecorder-player#78, webrecorder/webrecorder-player#86
---
 .../webrecorder/config/standalone_player.yaml |  2 --
 webrecorder/webrecorder/config/wr.yaml        |  3 +-
 webrecorder/webrecorder/models/importer.py    | 33 ++++++++++++++-----
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/webrecorder/webrecorder/config/standalone_player.yaml b/webrecorder/webrecorder/config/standalone_player.yaml
index 12f4d5fd5..73434706f 100644
--- a/webrecorder/webrecorder/config/standalone_player.yaml
+++ b/webrecorder/webrecorder/config/standalone_player.yaml
@@ -13,8 +13,6 @@ init_import_coll: 'collection'
 cdxj_key_templ: 'c:{coll}:cdxj'
 coll_cdxj_ttl: -1
 
-max_detect_pages: 0
-
 upload_coll:
   id: 'collection'
   title: 'Web Archive Collection'
diff --git a/webrecorder/webrecorder/config/wr.yaml b/webrecorder/webrecorder/config/wr.yaml
index c84dc9b8a..3fad159c7 100644
--- a/webrecorder/webrecorder/config/wr.yaml
+++ b/webrecorder/webrecorder/config/wr.yaml
@@ -28,7 +28,8 @@ skip_key_secs: 330
 open_rec_ttl: 5400
 
 max_warc_size: 500000000
 
-max_detect_pages: 0
+max_detect_pages: 10000
+max_auto_bookmarks: 10000
 
 assets_path: ./webrecorder/config/assets.yaml
diff --git a/webrecorder/webrecorder/models/importer.py b/webrecorder/webrecorder/models/importer.py
index 11150361a..c48d1741b 100644
--- a/webrecorder/webrecorder/models/importer.py
+++ b/webrecorder/webrecorder/models/importer.py
@@ -120,6 +120,7 @@ def __init__(self, redis, config, wam_loader=None):
         self.detect_list_info = config['page_detect_list']
 
         self.max_detect_pages = config['max_detect_pages']
+        self.max_auto_bookmarks = config['max_auto_bookmarks']
 
     def handle_upload(self, stream, upload_id, upload_key, infos, filename,
                       user, force_coll_name, total_size):
@@ -195,7 +196,7 @@ def _init_upload_status(self, user, total_size, num_files, filename=None, expire
 
         with redis_pipeline(self.redis) as pi:
             pi.hset(upload_key, 'size', 0)
-            pi.hset(upload_key, 'total_size', total_size * 2)
+            pi.hset(upload_key, 'total_size', int(total_size * 2.5))
             pi.hset(upload_key, 'total_files', num_files)
             pi.hset(upload_key, 'files', num_files)
 
@@ -240,8 +241,7 @@ def run_upload(self, upload_key, filename, stream, user, rec_infos, total_size,
             else:
                 logger.debug('SKIP upload for zero-length recording')
 
-
-            self.process_pages(info, page_id_map)
+            self.process_pages(info, page_id_map, upload_key, total_size)
 
             diff = info['offset'] - last_end
             last_end = info['offset'] + info['length']
@@ -280,13 +280,13 @@ def run_upload(self, upload_key, filename, stream, user, rec_infos, total_size,
             first_coll.sync_coll_index(exists=False, do_async=False)
             first_coll.set_external_remove_on_expire()
 
-    def process_pages(self, info, page_id_map):
+    def process_pages(self, info, page_id_map, upload_key, total_size):
         pages = info.get('pages')
 
         # detect pages if none
         detected = False
         if pages is None:
-            pages = self.detect_pages(info['coll'], info['rec'])
+            pages = self.detect_pages(info['coll'], info['rec'], upload_key, total_size)
            detected = True
 
         # if no pages, nothing more to do
@@ -303,9 +303,16 @@ def process_pages(self, info, page_id_map):
         if detected:
             blist = info['collection'].create_bookmark_list(self.detect_list_info)
 
+            # if set, further limit number of automatic bookmarks
+            if self.max_auto_bookmarks:
+                pages = pages[:self.max_auto_bookmarks]
+
+            incr = int((total_size * 0.25) / len(pages))
+
             for page in pages:
                 page['page_id'] = page['id']
                 bookmark = blist.create_bookmark(page, incr_stats=False)
+                self.redis.hincrby(upload_key, 'size', incr)
 
     def har2warc(self, filename, stream):
         """Convert HTTP Archive format file to WARC archive.
@@ -437,7 +444,7 @@ def import_lists(self, collection, page_id_map):
                 bookmark_data['page_id'] = page_id_map.get(page_id)
                 bookmark = blist.create_bookmark(bookmark_data, incr_stats=False)
 
-    def detect_pages(self, coll, rec):
+    def detect_pages(self, coll, rec, upload_key, total_size):
         """Find pages in recording.
 
         :param str coll: collection ID
@@ -449,17 +456,25 @@ def detect_pages(self, coll, rec):
         key = self.cdxj_key.format(coll=coll, rec=rec)
 
         pages = []
+        count = 0
+
+        total_cdx = self.redis.zcard(key)
+        if self.max_detect_pages:
+            total_cdx = min(self.max_detect_pages, total_cdx)
+
+        incr = int((total_size * 0.25) / total_cdx)
 
         #for member, score in self.redis.zscan_iter(key):
-        for member in self.redis.zrange(key, 0, -1):
+        for member, _ in zip(self.redis.zrange(key, 0, -1), range(total_cdx)):
             cdxj = CDXObject(member.encode('utf-8'))
 
-            if ((not self.max_detect_pages or len(pages) < self.max_detect_pages)
-                and self.is_page(cdxj)):
+            if self.is_page(cdxj):
                 pages.append(dict(url=cdxj['url'],
                                   title=cdxj['url'],
                                   timestamp=cdxj['timestamp']))
 
+            self.redis.hincrby(upload_key, 'size', incr)
+
         return pages
 
     def is_page(self, cdxj):
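
Note on the progress accounting above: the status total becomes
total_size * 2.5, with the extra half reserved in two quarter-size
slices, one for page detection and one for automatic bookmark creation;
each processed entry then advances the shared 'size' hash field by an
equal increment via HINCRBY. Below is a minimal standalone sketch of
that pattern, not the importer code itself: the phase weights and the
'size'/'total_size' field names mirror the patch, while report_phase,
upload_key, and the item count are hypothetical. Unlike the patch,
which divides by len(pages) / total_cdx directly and so assumes at
least one item, the sketch guards the zero-item case.

    import redis

    UPLOAD_WEIGHT = 2.0      # upload + indexing share, as before this patch
    PAGES_WEIGHT = 0.25      # page-detection share added by this patch
    BOOKMARKS_WEIGHT = 0.25  # auto-bookmark share added by this patch

    def report_phase(r, upload_key, total_size, num_items, weight):
        # advance the shared 'size' counter in equal slices for one phase;
        # guard num_items == 0 (the patch assumes a non-empty item list)
        if num_items <= 0:
            return
        incr = int((total_size * weight) / num_items)
        for _ in range(num_items):
            # ... per-item work here (create a page, a bookmark, etc.) ...
            r.hincrby(upload_key, 'size', incr)

    r = redis.StrictRedis()
    upload_key = 'up:example'  # hypothetical key, for illustration only
    total_size = 1000000
    r.hset(upload_key, 'size', 0)
    r.hset(upload_key, 'total_size', int(total_size * 2.5))
    report_phase(r, upload_key, total_size, num_items=50, weight=PAGES_WEIGHT)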
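
On the loop change in detect_pages: zip() stops at the shorter of its
iterables, so pairing the ZRANGE result with range(total_cdx) caps the
scan at max_detect_pages entries. itertools.islice expresses the same
bound; either way, zrange(key, 0, -1) still fetches the full sorted set
from Redis and only the iteration over it is capped, whereas a
zrange(key, 0, total_cdx - 1) call would push the cap to the Redis side
(ZRANGE stop indexes are inclusive). A small illustration of the idiom,
standalone and with a stand-in list rather than the importer's data:

    from itertools import islice

    members = ['a', 'b', 'c', 'd', 'e']  # stand-in for redis.zrange(key, 0, -1)
    cap = 3

    # zip() stops at the shorter iterable, so at most `cap` items are seen
    capped_zip = [m for m, _ in zip(members, range(cap))]

    # islice() spells the same bound without the throwaway counter
    capped_islice = list(islice(members, cap))

    assert capped_zip == capped_islice == ['a', 'b', 'c']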