Commit
Reorganize content of web_monitoring into directories (#528)
This reorganizes the content of web_monitoring into a hierarchy of modules for easier management and comprehension. See the discussion in #206 for more.

    scripts/                 # Stubs for things in web_monitoring/cli
        annotations_import
        ia_healthcheck
        wm
        wm-diffing-server
    web_monitoring/
        tests/               [same as today]
        diff/
            content_type.py
            differs.py
            diff_errors.py
            html_diff_render.py
            links_diff.py
        diff_server/
            server.py
        cli/
            cli.py
            ia_healthcheck.py
            ia_import.py
            annotations_import.py
        __init__.py
        _version.py
        utils.py
        db.py

This also drops `filtering.py`, which was vestigial and no longer used.
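Each entry in scripts/ is now a thin stub that delegates to the relocated module under web_monitoring/cli, as the two diffs shown below illustrate. The pattern, sketched here for annotations_import:

    #!/usr/bin/env python
    from web_monitoring.cli.annotations_import import main


    if __name__ == '__main__':
        main()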
Showing 20 changed files with 330 additions and 384 deletions.
scripts/annotations_import
@@ -1,168 +1,6 @@
#!/usr/bin/env python
import csv
from docopt import docopt
import logging
import os
import re
from tqdm import tqdm
from web_monitoring import db
from web_monitoring.cli.annotations_import import main

logger = logging.getLogger(__name__)
log_level = os.getenv('LOG_LEVEL', 'WARNING')
logger.setLevel(logging.__dict__[log_level])

class DictReaderStrip(csv.DictReader):
    @property
    def fieldnames(self):
        return [name.strip() for name in super().fieldnames]

def read_csv(csv_path):
    with open(csv_path, newline='') as csvfile:
        reader = DictReaderStrip(csvfile)
        for row in reader:
            yield row

DIFF_URL_REGEX = re.compile(r'^.*/page/(.*)/(.*)\.\.(.*)')
def find_change_ids(csv_row):
    diff_url = csv_row['Last Two - Side by Side']
    regex_result = DIFF_URL_REGEX.match(diff_url)
    if regex_result:
        (page_id, from_version_id, to_version_id) = regex_result.groups()
        return {'page_id': page_id,
                'from_version_id': from_version_id,
                'to_version_id': to_version_id}
    else:
        return None

class AnnotationAttributeInfo:
    def __init__(self, column_names, json_key):
        self.column_names = column_names
        self.json_key = json_key

class CsvSchemaError(Exception):
    ...

# If column names ever change while leaving the value semantics intact,
# add the new name to the correct list of column names here
BOOL_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
    (['Language alteration'],
     'language_alteration'),
    (['Link change/addition/removal'],
     'link_change'),
    (['Repeated Change across many pages or a domain'],
     'repeated_change'),
    (['Alteration within sections of a webpage'],
     'alteration_within_sections'),
    (['Alteration, removal, or addition of entire section(s) of a webpage'],
     'alteration_entire_sections'),
    (['Alteration, removal, or addition of an entire webpage or document'],
     'alteration_entire_webpage_or_document'),
    (['Overhaul, removal, or addition of an entire website'],
     'alteration_entire_website'),
    (['Alteration, removal, or addition of datasets'],
     'alteration_dataset')]]

STRING_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
    (['Is this primarily a content or access change (or both)?'],
     'content_or_access_change'),
    (['Brief Description'],
     'brief_description'),
    (['Topic 1'],
     'topic_1'),
    (['Subtopic 1a'],
     'subtopic_1a'),
    (['Subtopic 1b'],
     'subtopic_1b'),
    (['Topic 2'],
     'topic_2'),
    (['Subtopic 2a'],
     'subtopic_2a'),
    (['Subtopic 2b'],
     'subtopic_2b'),
    (['Topic 3'],
     'topic_3'),
    (['Subtopic 3a'],
     'subtopic_3a'),
    (['Subtopic 3b'],
     'subtopic_3b'),
    (['Any keywords to monitor (e.g. for term analyses)?'],
     'keywords_to_monitor'),
    (['Further Notes'],
     'further_notes'),
    (['Ask/tell other working groups?'],
     'ask_tell_other_working_groups'),

    # Including this so that we can eventually map it to
    # users in the database
    (['Who Found This?'],
     'annotation_author')]]

def get_attribute_value(attribute_info, csv_row):
    for column_name in attribute_info.column_names:
        if column_name in csv_row:
            return csv_row[column_name].strip()

    # Despite being raised in a row-level function, this error means that the
    # whole sheet is missing a column, so we don't catch and allow it to crash
    raise CsvSchemaError(f'Expected to find one of {attribute_info.column_names} '
                         f'in {csv_row.keys()}')

def create_annotation(csv_row, is_important_changes):
    annotation = {}

    for attribute_info in BOOL_ANNOTATION_ATTRIBUTES:
        attribute_value = get_attribute_value(attribute_info, csv_row)
        annotation[attribute_info.json_key] = attribute_value == '1'
    for attribute_info in STRING_ANNOTATION_ATTRIBUTES:
        attribute_value = get_attribute_value(attribute_info, csv_row)
        annotation[attribute_info.json_key] = attribute_value

    # This will need additional logic to determine the actual sheet schema
    annotation['annotation_schema'] = 'edgi_analyst_v2'

    significance = 0.0
    if is_important_changes:
        importance_significance_mapping = {
            'low': 0.5,
            'medium': 0.75,
            'high': 1.0
        }
        row_importance = csv_row['Importance?'].lower().strip()
        significance = importance_significance_mapping.get(row_importance, 0.0)
    annotation['significance'] = significance

    return annotation

def main():
    doc = """Add analyst annotations from a csv file to the Web Monitoring db.

    Usage:
    path/to/annotations_import.py <csv_path> [--is_important_changes]

    Options:
    --is_important_changes  Was this CSV generated from an Important Changes sheet?
    """
    arguments = docopt(doc)
    is_important_changes = arguments['--is_important_changes']
    csv_path = arguments['<csv_path>']

    client = db.Client.from_env()
    # Missing step: Analyze CSV to determine spreadsheet schema version
    for row in tqdm(read_csv(csv_path), unit=' rows'):
        change_ids = find_change_ids(row)
        annotation = create_annotation(row, is_important_changes)
        if not change_ids:
            logger.warning(f'failed to extract IDs from {row}')
        if not annotation:
            logger.warning(f'failed to extract annotation data from {row}')
        if change_ids and annotation:
            try:
                response = client.add_annotation(**change_ids,
                                                 annotation=annotation)
                logger.debug(response)
            except db.WebMonitoringDbError as e:
                logger.warning(
                    f'failed to post annotation for row {row} with error: {e}')

if __name__ == '__main__':
    main()
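A side benefit of the move is that the old script's helpers become importable library code. A minimal sketch, assuming find_change_ids moved verbatim into web_monitoring.cli.annotations_import along with main (the example URL is hypothetical):

    # Assumes find_change_ids now lives in web_monitoring.cli.annotations_import.
    from web_monitoring.cli.annotations_import import find_change_ids

    # A hypothetical diff URL in the .../page/<page>/<from>..<to> format the
    # DIFF_URL_REGEX above expects.
    row = {'Last Two - Side by Side': 'https://example.org/page/abc/v1..v2'}
    print(find_change_ids(row))
    # -> {'page_id': 'abc', 'from_version_id': 'v1', 'to_version_id': 'v2'}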
scripts/ia_healthcheck
@@ -1,125 +1,6 @@
#!/usr/bin/env python
from web_monitoring.cli.ia_healthcheck import main

# This script checks whether the Internet Archive's Wayback Machine has
# recent captures of the URLs we are tracking in the Web Monitoring Database.
# It works by taking a random sample of pages from the DB and using the CDX API
# to check that each has been captured at least once in the last few days.

from datetime import datetime, timedelta
import random
import sentry_sdk
import sys
from web_monitoring import db
from wayback import WaybackClient


# The current Sentry client truncates string values at 512 characters. It
# appears that monkey-patching this module global is the only way to change it
# and that doing so is the intended method:
# https://github.com/getsentry/sentry-python/blob/5f9f7c469af16a731948a482ea162c2348800999/sentry_sdk/utils.py#L662-L664
# That doesn't seem great, so I've asked about this on their forums:
# https://forum.sentry.io/t/some-stack-traces-are-truncated/7309/4
sentry_sdk.utils.MAX_STRING_LENGTH = 2048


MAX_CAPTURE_AGE = timedelta(hours=72)
LINKS_TO_CHECK = 10

# Sentry automatically instantiates with the `SENTRY_DSN` environment variable.
# If not set, all its methods will operate conveniently as no-ops.
sentry_sdk.init()


def sample_monitored_urls(sample_size):
    """
    Get a random sample of `sample_size` URLs that are tracked in a Web
    Monitoring DB instance.

    Returns
    -------
    list of string
    """
    client = db.Client.from_env()
    page = client.list_pages(chunk=1, chunk_size=1, active=True, include_total=True)
    url_count = page['meta']['total_results']
    return (get_page_url(client, index)
            for index in random.sample(range(url_count), sample_size))


def get_page_url(client, index):
    return client.list_pages(chunk=index, chunk_size=1, active=True)['data'][0]['url']


def wayback_has_captures(url, from_date=None):
    """
    Determine whether the Wayback Machine has any recent captures of a URL.

    Parameters
    ----------
    url : string

    Returns
    -------
    bool
    """
    with WaybackClient() as wayback:
        versions = wayback.search(url, from_date=from_date)
        try:
            next(versions)
        except StopIteration:
            return False
        else:
            return True


def output_results(statuses):
    """
    Output nicely formatted results.

    Parameters
    ----------
    statuses : sequence of tuple of (str, bool)
    """
    healthy_links = 0
    unhealthy_links = 0

    logs = []
    for (url, status) in statuses:
        if status:
            healthy_links += 1
            status_text = '✔︎ Found'
        else:
            unhealthy_links += 1
            status_text = '✘ Missing'

        message = f'{status_text}: {url}'
        print(message)
        logs.append(message)

    # At this point, everything is OK; we don't need breadcrumbs and other
    # extra noise to come with the message we are about to send.
    with sentry_sdk.configure_scope() as scope:
        scope.clear()

    if healthy_links + unhealthy_links == 0:
        print('Failed to sample any pages!')
        sentry_sdk.capture_message('Failed to sample any pages!')
    else:
        message = f'\nFound: {healthy_links} healthy links and {unhealthy_links} unhealthy links.'
        print(message)
        if unhealthy_links > 0:
            log_string = '\n'.join(logs)
            sentry_sdk.capture_message(f'{message}\n{log_string}')


if __name__ == "__main__":
    try:
        print(f'Sampling {LINKS_TO_CHECK} pages from Web Monitoring API...')
        links = sample_monitored_urls(LINKS_TO_CHECK)
        from_date = datetime.now() - MAX_CAPTURE_AGE
        print('Checking for captures in Wayback Machine...')
        capture_statuses = ((url, wayback_has_captures(url, from_date))
                            for url in links)
        output_results(capture_statuses)
    except db.MissingCredentials as error:
        print(error, file=sys.stderr)

if __name__ == '__main__':
    main()
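Likewise, the healthcheck's pieces can now be exercised individually. A short sketch, assuming wayback_has_captures moved unchanged into web_monitoring.cli.ia_healthcheck:

    # Assumes wayback_has_captures is importable from the relocated module.
    from datetime import datetime, timedelta
    from web_monitoring.cli.ia_healthcheck import wayback_has_captures

    # Has this URL been captured within the last 72 hours (MAX_CAPTURE_AGE)?
    from_date = datetime.now() - timedelta(hours=72)
    print(wayback_has_captures('https://example.com/', from_date=from_date))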