From 127a9247789cea19a720a209be537bf9ad40522c Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Mon, 9 Dec 2019 10:40:59 -0800 Subject: [PATCH] Reorganize content of web_monitoring into directories (#528) This reorganizes the content of web_monitoring into a hierarchy of modules for easier management and comprehension. See the discussion in #206 for more. scripts/ # Stubs for things in web_monitoring/cli annotations_import ia_healthcheck wm wm-diffing-server web_monitoring/ tests/ [same as today] diff/ content_type.py differs.py diff_errors.py html_diff_render.py links_diff.py diff_server/ server.py cli/ cli.py ia_healthcheck.py ia_import.py annotations_import.py __init__.py _version.py utils.py db.py This also drops `filtering.py`, which was vestigial and no longer used. --- scripts/annotations_import | 164 +---------------- scripts/ia_healthcheck | 125 +------------ scripts/wm | 2 +- scripts/wm-diffing-server | 3 +- web_monitoring/cli/annotations_import.py | 165 ++++++++++++++++++ web_monitoring/{ => cli}/cli.py | 0 web_monitoring/cli/ia_healthcheck.py | 125 +++++++++++++ web_monitoring/{ => diff}/content_type.py | 0 web_monitoring/{ => diff}/diff_errors.py | 0 web_monitoring/{ => diff}/differs.py | 2 +- web_monitoring/{ => diff}/html_diff_render.py | 2 +- web_monitoring/{ => diff}/links_diff.py | 2 +- .../server.py} | 38 ++-- web_monitoring/filtering.py | 60 ------- web_monitoring/tests/test_cli.py | 4 +- web_monitoring/tests/test_differs.py | 2 +- .../tests/test_diffing_server_exc_handling.py | 8 +- web_monitoring/tests/test_html_diff.py | 4 +- .../tests/test_html_diff_validity.py | 4 +- web_monitoring/tests/test_links_diff.py | 4 +- 20 files changed, 330 insertions(+), 384 deletions(-) create mode 100644 web_monitoring/cli/annotations_import.py rename web_monitoring/{ => cli}/cli.py (100%) create mode 100644 web_monitoring/cli/ia_healthcheck.py rename web_monitoring/{ => diff}/content_type.py (100%) rename web_monitoring/{ => diff}/diff_errors.py (100%) rename web_monitoring/{ => diff}/differs.py (99%) rename web_monitoring/{ => diff}/html_diff_render.py (99%) rename web_monitoring/{ => diff}/links_diff.py (99%) rename web_monitoring/{diffing_server.py => diff_server/server.py} (94%) delete mode 100644 web_monitoring/filtering.py diff --git a/scripts/annotations_import b/scripts/annotations_import index dd1a71185..c21a52eb1 100755 --- a/scripts/annotations_import +++ b/scripts/annotations_import @@ -1,168 +1,6 @@ #!/usr/bin/env python -import csv -from docopt import docopt -import logging -import os -import re -from tqdm import tqdm -from web_monitoring import db +from web_monitoring.cli.annotations_import import main -logger = logging.getLogger(__name__) -log_level = os.getenv('LOG_LEVEL', 'WARNING') -logger.setLevel(logging.__dict__[log_level]) - -class DictReaderStrip(csv.DictReader): - @property - def fieldnames(self): - return [name.strip() for name in super().fieldnames] - -def read_csv(csv_path): - with open(csv_path, newline='') as csvfile: - reader = DictReaderStrip(csvfile) - for row in reader: - yield row - -DIFF_URL_REGEX = re.compile(r'^.*/page/(.*)/(.*)\.\.(.*)') -def find_change_ids(csv_row): - diff_url = csv_row['Last Two - Side by Side'] - regex_result = DIFF_URL_REGEX.match(diff_url) - if regex_result: - (page_id, from_version_id, to_version_id) = regex_result.groups() - return {'page_id': page_id, - 'from_version_id': from_version_id, - 'to_version_id': to_version_id} - else: - return None - -class AnnotationAttributeInfo: - def __init__(self, column_names, 
json_key): - self.column_names = column_names - self.json_key = json_key - -class CsvSchemaError(Exception): - ... - -# If column names ever change while leaving the value semantics intact, -# add the new name to the correct list of column names here -BOOL_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [ - (['Language alteration'], - 'language_alteration'), - (['Link change/addition/removal'], - 'link_change'), - (['Repeated Change across many pages or a domain'], - 'repeated_change'), - (['Alteration within sections of a webpage'], - 'alteration_within_sections'), - (['Alteration, removal, or addition of entire section(s) of a webpage'], - 'alteration_entire_sections'), - (['Alteration, removal, or addition of an entire webpage or document'], - 'alteration_entire_webpage_or_document'), - (['Overhaul, removal, or addition of an entire website'], - 'alteration_entire_website'), - (['Alteration, removal, or addition of datasets'], - 'alteration_dataset')]] - -STRING_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [ - (['Is this primarily a content or access change (or both)?'], - 'content_or_access_change'), - (['Brief Description'], - 'brief_description'), - (['Topic 1'], - 'topic_1'), - (['Subtopic 1a'], - 'subtopic_1a'), - (['Subtopic 1b'], - 'subtopic_1b'), - (['Topic 2'], - 'topic_2'), - (['Subtopic 2a'], - 'subtopic_2a'), - (['Subtopic 2b'], - 'subtopic_2b'), - (['Topic 3'], - 'topic_3'), - (['Subtopic 3a'], - 'subtopic_3a'), - (['Subtopic 3b'], - 'subtopic_3b'), - (['Any keywords to monitor (e.g. for term analyses)?'], - 'keywords_to_monitor'), - (['Further Notes'], - 'further_notes'), - (['Ask/tell other working groups?'], - 'ask_tell_other_working_groups'), - - # Including this so that we can eventually map it to - # users in the database - (['Who Found This?'], - 'annotation_author')]] - -def get_attribute_value(attribute_info, csv_row): - for column_name in attribute_info.column_names: - if column_name in csv_row: - return csv_row[column_name].strip() - - # Despite being raised in a row-level function, this error means that the - # whole sheet is missing a column, so we don't catch and allow it to crash - raise CsvSchemaError(f'Expected to find one of {attribute_info.column_names} ' - f'in {csv_row.keys()}') - -def create_annotation(csv_row, is_important_changes): - annotation = {} - - for attribute_info in BOOL_ANNOTATION_ATTRIBUTES: - attribute_value = get_attribute_value(attribute_info, csv_row) - annotation[attribute_info.json_key] = attribute_value == '1' - for attribute_info in STRING_ANNOTATION_ATTRIBUTES: - attribute_value = get_attribute_value(attribute_info, csv_row) - annotation[attribute_info.json_key] = attribute_value - - # This will need additional logic to determine the actual sheet schema - annotation['annotation_schema'] = 'edgi_analyst_v2' - - significance = 0.0 - if is_important_changes: - importance_significance_mapping = { - 'low': 0.5, - 'medium': 0.75, - 'high': 1.0 - } - row_importance = csv_row['Importance?'].lower().strip() - significance = importance_significance_mapping.get(row_importance, 0.0) - annotation['significance'] = significance - - return annotation - -def main(): - doc = """Add analyst annotations from a csv file to the Web Monitoring db. - -Usage: -path/to/annotations_import.py [--is_important_changes] - -Options: ---is_important_changes Was this CSV generated from an Important Changes sheet? 
-""" - arguments = docopt(doc) - is_important_changes = arguments['--is_important_changes'] - csv_path = arguments[''] - - client = db.Client.from_env() - # Missing step: Analyze CSV to determine spreadsheet schema version - for row in tqdm(read_csv(csv_path), unit=' rows'): - change_ids = find_change_ids(row) - annotation = create_annotation(row, is_important_changes) - if not change_ids: - logger.warning(f'failed to extract IDs from {row}') - if not annotation: - logger.warning(f'failed to extract annotation data from {row}') - if change_ids and annotation: - try: - response = client.add_annotation(**change_ids, - annotation=annotation) - logger.debug(response) - except db.WebMonitoringDbError as e: - logger.warning( - f'failed to post annotation for row {row} with error: {e}') if __name__ == '__main__': main() diff --git a/scripts/ia_healthcheck b/scripts/ia_healthcheck index 4c6351aa9..b0722f48f 100755 --- a/scripts/ia_healthcheck +++ b/scripts/ia_healthcheck @@ -1,125 +1,6 @@ #!/usr/bin/env python +from web_monitoring.cli.ia_healthcheck import main -# This script checks whether the Internet Archive's Wayback Machine has -# recent captures of the URLs we are tracking in the Web Monitoring Database. -# It works by taking a random sample of pages from the DB and using the CDX API -# to check that each has been captured at least once in the last few days. -from datetime import datetime, timedelta -import random -import sentry_sdk -import sys -from web_monitoring import db -from wayback import WaybackClient - - -# The current Sentry client truncates string values at 512 characters. It -# appears that monkey-patching this module global is only way to change it and -# that doing so is the intended method: -# https://github.com/getsentry/sentry-python/blob/5f9f7c469af16a731948a482ea162c2348800999/sentry_sdk/utils.py#L662-L664 -# That doesn't seem great, so I've asked about this on their forums: -# https://forum.sentry.io/t/some-stack-traces-are-truncated/7309/4 -sentry_sdk.utils.MAX_STRING_LENGTH = 2048 - - -MAX_CAPTURE_AGE = timedelta(hours=72) -LINKS_TO_CHECK = 10 - -# Sentry automatically instantiates with the `SENTRY_DSN` environment variable. -# If not set, all its methods will operate conveniently as no-ops. -sentry_sdk.init() - - -def sample_monitored_urls(sample_size): - """ - Get a random sample of `sample_size` URLs that are tracked in a Web - Monitoring DB instance. - - Returns - ------- - list of string - """ - client = db.Client.from_env() - page = client.list_pages(chunk=1, chunk_size=1, active=True, include_total=True) - url_count = page['meta']['total_results'] - return (get_page_url(client, index) - for index in random.sample(range(url_count), sample_size)) - - -def get_page_url(client, index): - return client.list_pages(chunk=index, chunk_size=1, active=True)['data'][0]['url'] - - -def wayback_has_captures(url, from_date=None): - """ - Determine whether the Wayback Machine has any recent captures of a URL. - - Parameters - ---------- - url : string - - Returns - ------- - list of JSON - """ - with WaybackClient() as wayback: - versions = wayback.search(url, from_date=from_date) - try: - next(versions) - except StopIteration: - return False - else: - return True - - -def output_results(statuses): - """ - Output nicely formatted results. 
- - Parameters - ---------- - statuses: sequence of tuple of (str, bool) - """ - healthy_links = 0 - unhealthy_links = 0 - - logs = [] - for (url, status) in statuses: - if status: - healthy_links += 1 - status_text = '✔︎ Found' - else: - unhealthy_links += 1 - status_text = '✘ Missing' - - message = f'{status_text}: {url}' - print(message) - logs.append(message) - - # At this point, everything is OK; we don't need breadcrumbs and other - # extra noise to come with the message we are about to send. - with sentry_sdk.configure_scope() as scope: - scope.clear() - - if healthy_links + unhealthy_links == 0: - print('Failed to sampled any pages!') - sentry_sdk.capture_message('Failed to sampled any pages!') - else: - message = f'\nFound: {healthy_links} healthy links and {unhealthy_links} unhealthy links.' - print(message) - if unhealthy_links > 0: - log_string = '\n'.join(logs) - sentry_sdk.capture_message(f'{message}\n{log_string}') - - -if __name__ == "__main__": - try: - print(f'Sampling {LINKS_TO_CHECK} pages from Web Monitoring API...') - links = sample_monitored_urls(LINKS_TO_CHECK) - from_date = datetime.now() - MAX_CAPTURE_AGE - print(f'Checking for captures in Wayback Machine...') - capture_statuses = ((url, wayback_has_captures(url, from_date)) - for url in links) - output_results(capture_statuses) - except db.MissingCredentials as error: - print(error, file=sys.stderr) +if __name__ == '__main__': + main() diff --git a/scripts/wm b/scripts/wm index 3b8f06086..d794b7016 100755 --- a/scripts/wm +++ b/scripts/wm @@ -1,5 +1,5 @@ #!/usr/bin/env python -from web_monitoring.cli import main +from web_monitoring.cli.cli import main if __name__ == '__main__': diff --git a/scripts/wm-diffing-server b/scripts/wm-diffing-server index 5cb8d6c90..b02b606f9 100755 --- a/scripts/wm-diffing-server +++ b/scripts/wm-diffing-server @@ -1,6 +1,5 @@ #!/usr/bin/env python -import sys -from web_monitoring.diffing_server import cli +from web_monitoring.diff_server.server import cli if __name__ == '__main__': diff --git a/web_monitoring/cli/annotations_import.py b/web_monitoring/cli/annotations_import.py new file mode 100644 index 000000000..008d03ca0 --- /dev/null +++ b/web_monitoring/cli/annotations_import.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +import csv +from docopt import docopt +import logging +import os +import re +from tqdm import tqdm +from web_monitoring import db + +logger = logging.getLogger(__name__) +log_level = os.getenv('LOG_LEVEL', 'WARNING') +logger.setLevel(logging.__dict__[log_level]) + +class DictReaderStrip(csv.DictReader): + @property + def fieldnames(self): + return [name.strip() for name in super().fieldnames] + +def read_csv(csv_path): + with open(csv_path, newline='') as csvfile: + reader = DictReaderStrip(csvfile) + for row in reader: + yield row + +DIFF_URL_REGEX = re.compile(r'^.*/page/(.*)/(.*)\.\.(.*)') +def find_change_ids(csv_row): + diff_url = csv_row['Last Two - Side by Side'] + regex_result = DIFF_URL_REGEX.match(diff_url) + if regex_result: + (page_id, from_version_id, to_version_id) = regex_result.groups() + return {'page_id': page_id, + 'from_version_id': from_version_id, + 'to_version_id': to_version_id} + else: + return None + +class AnnotationAttributeInfo: + def __init__(self, column_names, json_key): + self.column_names = column_names + self.json_key = json_key + +class CsvSchemaError(Exception): + ... 
+
+# If column names ever change while leaving the value semantics intact,
+# add the new name to the correct list of column names here
+BOOL_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
+ (['Language alteration'],
+ 'language_alteration'),
+ (['Link change/addition/removal'],
+ 'link_change'),
+ (['Repeated Change across many pages or a domain'],
+ 'repeated_change'),
+ (['Alteration within sections of a webpage'],
+ 'alteration_within_sections'),
+ (['Alteration, removal, or addition of entire section(s) of a webpage'],
+ 'alteration_entire_sections'),
+ (['Alteration, removal, or addition of an entire webpage or document'],
+ 'alteration_entire_webpage_or_document'),
+ (['Overhaul, removal, or addition of an entire website'],
+ 'alteration_entire_website'),
+ (['Alteration, removal, or addition of datasets'],
+ 'alteration_dataset')]]
+
+STRING_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
+ (['Is this primarily a content or access change (or both)?'],
+ 'content_or_access_change'),
+ (['Brief Description'],
+ 'brief_description'),
+ (['Topic 1'],
+ 'topic_1'),
+ (['Subtopic 1a'],
+ 'subtopic_1a'),
+ (['Subtopic 1b'],
+ 'subtopic_1b'),
+ (['Topic 2'],
+ 'topic_2'),
+ (['Subtopic 2a'],
+ 'subtopic_2a'),
+ (['Subtopic 2b'],
+ 'subtopic_2b'),
+ (['Topic 3'],
+ 'topic_3'),
+ (['Subtopic 3a'],
+ 'subtopic_3a'),
+ (['Subtopic 3b'],
+ 'subtopic_3b'),
+ (['Any keywords to monitor (e.g. for term analyses)?'],
+ 'keywords_to_monitor'),
+ (['Further Notes'],
+ 'further_notes'),
+ (['Ask/tell other working groups?'],
+ 'ask_tell_other_working_groups'),
+
+ # Including this so that we can eventually map it to
+ # users in the database
+ (['Who Found This?'],
+ 'annotation_author')]]
+
+def get_attribute_value(attribute_info, csv_row):
+ for column_name in attribute_info.column_names:
+ if column_name in csv_row:
+ return csv_row[column_name].strip()
+
+ # Despite being raised in a row-level function, this error means that the
+ # whole sheet is missing a column, so we don't catch and allow it to crash
+ raise CsvSchemaError(f'Expected to find one of {attribute_info.column_names} '
+ f'in {csv_row.keys()}')
+
+def create_annotation(csv_row, is_important_changes):
+ annotation = {}
+
+ for attribute_info in BOOL_ANNOTATION_ATTRIBUTES:
+ attribute_value = get_attribute_value(attribute_info, csv_row)
+ annotation[attribute_info.json_key] = attribute_value == '1'
+ for attribute_info in STRING_ANNOTATION_ATTRIBUTES:
+ attribute_value = get_attribute_value(attribute_info, csv_row)
+ annotation[attribute_info.json_key] = attribute_value
+
+ # This will need additional logic to determine the actual sheet schema
+ annotation['annotation_schema'] = 'edgi_analyst_v2'
+
+ significance = 0.0
+ if is_important_changes:
+ importance_significance_mapping = {
+ 'low': 0.5,
+ 'medium': 0.75,
+ 'high': 1.0
+ }
+ row_importance = csv_row['Importance?'].lower().strip()
+ significance = importance_significance_mapping.get(row_importance, 0.0)
+ annotation['significance'] = significance
+
+ return annotation
+
+def main():
+ doc = """Add analyst annotations from a csv file to the Web Monitoring db.
+
+Usage:
+path/to/annotations_import.py <csv_path> [--is_important_changes]
+
+Options:
+--is_important_changes Was this CSV generated from an Important Changes sheet?
+""" + arguments = docopt(doc) + is_important_changes = arguments['--is_important_changes'] + csv_path = arguments[''] + + client = db.Client.from_env() + # Missing step: Analyze CSV to determine spreadsheet schema version + for row in tqdm(read_csv(csv_path), unit=' rows'): + change_ids = find_change_ids(row) + annotation = create_annotation(row, is_important_changes) + if not change_ids: + logger.warning(f'failed to extract IDs from {row}') + if not annotation: + logger.warning(f'failed to extract annotation data from {row}') + if change_ids and annotation: + try: + response = client.add_annotation(**change_ids, + annotation=annotation) + logger.debug(response) + except db.WebMonitoringDbError as e: + logger.warning( + f'failed to post annotation for row {row} with error: {e}') diff --git a/web_monitoring/cli.py b/web_monitoring/cli/cli.py similarity index 100% rename from web_monitoring/cli.py rename to web_monitoring/cli/cli.py diff --git a/web_monitoring/cli/ia_healthcheck.py b/web_monitoring/cli/ia_healthcheck.py new file mode 100644 index 000000000..ace46095f --- /dev/null +++ b/web_monitoring/cli/ia_healthcheck.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python + +# This script checks whether the Internet Archive's Wayback Machine has +# recent captures of the URLs we are tracking in the Web Monitoring Database. +# It works by taking a random sample of pages from the DB and using the CDX API +# to check that each has been captured at least once in the last few days. + +from datetime import datetime, timedelta +import random +import sentry_sdk +import sys +from wayback import WaybackClient +from .. import db + + +# The current Sentry client truncates string values at 512 characters. It +# appears that monkey-patching this module global is only way to change it and +# that doing so is the intended method: +# https://github.com/getsentry/sentry-python/blob/5f9f7c469af16a731948a482ea162c2348800999/sentry_sdk/utils.py#L662-L664 +# That doesn't seem great, so I've asked about this on their forums: +# https://forum.sentry.io/t/some-stack-traces-are-truncated/7309/4 +sentry_sdk.utils.MAX_STRING_LENGTH = 2048 + + +MAX_CAPTURE_AGE = timedelta(hours=72) +LINKS_TO_CHECK = 10 + +# Sentry automatically instantiates with the `SENTRY_DSN` environment variable. +# If not set, all its methods will operate conveniently as no-ops. +sentry_sdk.init() + + +def sample_monitored_urls(sample_size): + """ + Get a random sample of `sample_size` URLs that are tracked in a Web + Monitoring DB instance. + + Returns + ------- + list of string + """ + client = db.Client.from_env() + page = client.list_pages(chunk=1, chunk_size=1, active=True, include_total=True) + url_count = page['meta']['total_results'] + return (get_page_url(client, index) + for index in random.sample(range(url_count), sample_size)) + + +def get_page_url(client, index): + return client.list_pages(chunk=index, chunk_size=1, active=True)['data'][0]['url'] + + +def wayback_has_captures(url, from_date=None): + """ + Determine whether the Wayback Machine has any recent captures of a URL. + + Parameters + ---------- + url : string + + Returns + ------- + list of JSON + """ + with WaybackClient() as wayback: + versions = wayback.search(url, from_date=from_date) + try: + next(versions) + except StopIteration: + return False + else: + return True + + +def output_results(statuses): + """ + Output nicely formatted results. 
+
+ Parameters
+ ----------
+ statuses: sequence of tuple of (str, bool)
+ """
+ healthy_links = 0
+ unhealthy_links = 0
+
+ logs = []
+ for (url, status) in statuses:
+ if status:
+ healthy_links += 1
+ status_text = '✔︎ Found'
+ else:
+ unhealthy_links += 1
+ status_text = '✘ Missing'
+
+ message = f'{status_text}: {url}'
+ print(message)
+ logs.append(message)
+
+ # At this point, everything is OK; we don't need breadcrumbs and other
+ # extra noise to come with the message we are about to send.
+ with sentry_sdk.configure_scope() as scope:
+ scope.clear()
+
+ if healthy_links + unhealthy_links == 0:
+ print('Failed to sample any pages!')
+ sentry_sdk.capture_message('Failed to sample any pages!')
+ else:
+ message = f'\nFound: {healthy_links} healthy links and {unhealthy_links} unhealthy links.'
+ print(message)
+ if unhealthy_links > 0:
+ log_string = '\n'.join(logs)
+ sentry_sdk.capture_message(f'{message}\n{log_string}')
+
+
+def main():
+ try:
+ print(f'Sampling {LINKS_TO_CHECK} pages from Web Monitoring API...')
+ links = sample_monitored_urls(LINKS_TO_CHECK)
+ from_date = datetime.now() - MAX_CAPTURE_AGE
+ print(f'Checking for captures in Wayback Machine...')
+ capture_statuses = ((url, wayback_has_captures(url, from_date))
+ for url in links)
+ output_results(capture_statuses)
+ except db.MissingCredentials as error:
+ print(error, file=sys.stderr)
diff --git a/web_monitoring/content_type.py b/web_monitoring/diff/content_type.py
similarity index 100%
rename from web_monitoring/content_type.py
rename to web_monitoring/diff/content_type.py
diff --git a/web_monitoring/diff_errors.py b/web_monitoring/diff/diff_errors.py
similarity index 100%
rename from web_monitoring/diff_errors.py
rename to web_monitoring/diff/diff_errors.py
diff --git a/web_monitoring/differs.py b/web_monitoring/diff/differs.py
similarity index 99%
rename from web_monitoring/differs.py
rename to web_monitoring/diff/differs.py
index 41f7b47da..44ef9f7e5 100644
--- a/web_monitoring/differs.py
+++ b/web_monitoring/diff/differs.py
@@ -1,6 +1,6 @@
 from bs4 import Comment
 from diff_match_patch import diff, diff_bytes
-from web_monitoring.utils import get_color_palette
+from ..utils import get_color_palette
 from htmldiffer.diff import HTMLDiffer
 import htmltreediff
 import html5_parser
diff --git a/web_monitoring/html_diff_render.py b/web_monitoring/diff/html_diff_render.py
similarity index 99%
rename from web_monitoring/html_diff_render.py
rename to web_monitoring/diff/html_diff_render.py
index 00e15f2d1..d6fe0606a 100644
--- a/web_monitoring/html_diff_render.py
+++ b/web_monitoring/diff/html_diff_render.py
@@ -20,7 +20,7 @@
 from functools import lru_cache
 import copy
 import difflib
-from web_monitoring.utils import get_color_palette
+from ..utils import get_color_palette
 import html
 import html5_parser
 import logging
diff --git a/web_monitoring/links_diff.py b/web_monitoring/diff/links_diff.py
similarity index 99%
rename from web_monitoring/links_diff.py
rename to web_monitoring/diff/links_diff.py
index 0b053079c..634d8873b 100644
--- a/web_monitoring/links_diff.py
+++ b/web_monitoring/diff/links_diff.py
@@ -1,7 +1,7 @@
 import html5_parser
 from .content_type import raise_if_not_diffable_html
 from .differs import compute_dmp_diff
-from web_monitoring.utils import get_color_palette
+from ..utils import get_color_palette
 from difflib import SequenceMatcher
 from .html_diff_render import (get_title, _html_for_dmp_operation,
 undiffable_content_tags)
diff --git a/web_monitoring/diffing_server.py
b/web_monitoring/diff_server/server.py similarity index 94% rename from web_monitoring/diffing_server.py rename to web_monitoring/diff_server/server.py index 6892dc157..468a54639 100644 --- a/web_monitoring/diffing_server.py +++ b/web_monitoring/diff_server/server.py @@ -14,10 +14,8 @@ import tornado.web import traceback import web_monitoring -import web_monitoring.differs -from web_monitoring.diff_errors import UndiffableContentError, UndecodableContentError -import web_monitoring.html_diff_render -import web_monitoring.links_diff +from ..diff import differs, html_diff_render, links_diff +from ..diff.diff_errors import UndiffableContentError, UndecodableContentError # Track errors with Sentry.io. It will automatically detect the `SENTRY_DSN` # environment variable. If not set, all its methods will operate conveniently @@ -32,26 +30,26 @@ # Map tokens in the REST API to functions in modules. # The modules do not have to be part of the web_monitoring package. DIFF_ROUTES = { - "length": web_monitoring.differs.compare_length, - "identical_bytes": web_monitoring.differs.identical_bytes, - "side_by_side_text": web_monitoring.differs.side_by_side_text, - "links": web_monitoring.links_diff.links_diff_html, - "links_json": web_monitoring.links_diff.links_diff_json, + "length": differs.compare_length, + "identical_bytes": differs.identical_bytes, + "side_by_side_text": differs.side_by_side_text, + "links": links_diff.links_diff_html, + "links_json": links_diff.links_diff_json, # applying diff-match-patch (dmp) to strings (no tokenization) - "html_text_dmp": web_monitoring.differs.html_text_diff, - "html_source_dmp": web_monitoring.differs.html_source_diff, + "html_text_dmp": differs.html_text_diff, + "html_source_dmp": differs.html_source_diff, # three different approaches to the same goal: - "html_token": web_monitoring.html_diff_render.html_diff_render, - "html_tree": web_monitoring.differs.html_tree_diff, - "html_perma_cc": web_monitoring.differs.html_differ, + "html_token": html_diff_render.html_diff_render, + "html_tree": differs.html_tree_diff, + "html_perma_cc": differs.html_differ, # deprecated synonyms - "links_diff": web_monitoring.links_diff.links_diff, - "html_text_diff": web_monitoring.differs.html_text_diff, - "html_source_diff": web_monitoring.differs.html_source_diff, - "html_visual_diff": web_monitoring.html_diff_render.html_diff_render, - "html_tree_diff": web_monitoring.differs.html_tree_diff, - "html_differ": web_monitoring.differs.html_differ, + "links_diff": links_diff.links_diff, + "html_text_diff": differs.html_text_diff, + "html_source_diff": differs.html_source_diff, + "html_visual_diff": html_diff_render.html_diff_render, + "html_tree_diff": differs.html_tree_diff, + "html_differ": differs.html_differ, } # Matches a tag in HTML used to specify the character encoding: diff --git a/web_monitoring/filtering.py b/web_monitoring/filtering.py deleted file mode 100644 index 536738138..000000000 --- a/web_monitoring/filtering.py +++ /dev/null @@ -1,60 +0,0 @@ -from urllib.parse import urlparse -import html5_parser - - -day_list = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'] -month_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] -tag_list = ['
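Note on downstream imports: the renames above move the previously flat modules into the cli/, diff/, and diff_server/ subpackages. A minimal sketch of how calling code outside this patch would update its imports, using only module and function names visible in the renames and stubs above (the `as ...` aliases are illustrative; whether anything is re-exported from the package root is not shown in this patch):

    # old: from web_monitoring.cli import main
    from web_monitoring.cli.cli import main as wm_main
    # old: from web_monitoring.differs import html_text_diff
    from web_monitoring.diff.differs import html_text_diff
    # old: from web_monitoring.links_diff import links_diff_html
    from web_monitoring.diff.links_diff import links_diff_html
    # old: from web_monitoring.diffing_server import cli
    from web_monitoring.diff_server.server import cli as diffing_server_cli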