From 874b4b8dcacd9e4cac2f607bec27980900b7a5ae Mon Sep 17 00:00:00 2001 From: Eric Price Date: Sun, 21 Dec 2014 12:09:55 -0800 Subject: [PATCH] add more logging of scraper status --- website/frontend/management/commands/scraper.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/website/frontend/management/commands/scraper.py b/website/frontend/management/commands/scraper.py index 5aae424b..8e3cfd80 100644 --- a/website/frontend/management/commands/scraper.py +++ b/website/frontend/management/commands/scraper.py @@ -251,6 +251,7 @@ def add_to_git_repo(data, filename, article): if already_exists: if previous == data: + logger.debug('Article matches current version in repo') return None, None, None #Now check how many times this same version has appeared before @@ -330,7 +331,7 @@ def update_article(article): return to_store = unicode(parsed_article).encode('utf8') t = datetime.now() - + logger.debug('Article parsed; trying to store') v, boring, diff_info = add_to_git_repo(to_store, url_to_filename(article.url), article) @@ -351,11 +352,16 @@ def update_article(article): def update_articles(todays_git_dir): logger.info('Starting scraper; looking for new URLs') - for url in get_all_article_urls(): + all_urls = get_all_article_urls() + logger.info('Got all %s urls; storing to database' % len(all_urls)) + for i, url in enumerate(all_urls): + logger.debug('Woo: %d/%d is %s' % (i+1, len(all_urls), url)) if len(url) > 255: #Icky hack, but otherwise they're truncated in DB. continue if not models.Article.objects.filter(url=url).count(): + logger.debug('Adding!') models.Article(url=url, git_dir=todays_git_dir).save() + logger.info('Done storing to database') def get_update_delay(minutes_since_update): days_since_update = minutes_since_update // (24 * 60)