From 127a9247789cea19a720a209be537bf9ad40522c Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Mon, 9 Dec 2019 10:40:59 -0800 Subject: [PATCH] Reorganize content of web_monitoring into directories (#528) This reorganizes the content of web_monitoring into a hierarchy of modules for easier management and comprehension. See the discussion in #206 for more. scripts/ # Stubs for things in web_monitoring/cli annotations_import ia_healthcheck wm wm-diffing-server web_monitoring/ tests/ [same as today] diff/ content_type.py differs.py diff_errors.py html_diff_render.py links_diff.py diff_server/ server.py cli/ cli.py ia_healthcheck.py ia_import.py annotations_import.py __init__.py _version.py utils.py db.py This also drops `filtering.py`, which was vestigial and no longer used. --- scripts/annotations_import | 164 +---------------- scripts/ia_healthcheck | 125 +------------ scripts/wm | 2 +- scripts/wm-diffing-server | 3 +- web_monitoring/cli/annotations_import.py | 165 ++++++++++++++++++ web_monitoring/{ => cli}/cli.py | 0 web_monitoring/cli/ia_healthcheck.py | 125 +++++++++++++ web_monitoring/{ => diff}/content_type.py | 0 web_monitoring/{ => diff}/diff_errors.py | 0 web_monitoring/{ => diff}/differs.py | 2 +- web_monitoring/{ => diff}/html_diff_render.py | 2 +- web_monitoring/{ => diff}/links_diff.py | 2 +- .../server.py} | 38 ++-- web_monitoring/filtering.py | 60 ------- web_monitoring/tests/test_cli.py | 4 +- web_monitoring/tests/test_differs.py | 2 +- .../tests/test_diffing_server_exc_handling.py | 8 +- web_monitoring/tests/test_html_diff.py | 4 +- .../tests/test_html_diff_validity.py | 4 +- web_monitoring/tests/test_links_diff.py | 4 +- 20 files changed, 330 insertions(+), 384 deletions(-) create mode 100644 web_monitoring/cli/annotations_import.py rename web_monitoring/{ => cli}/cli.py (100%) create mode 100644 web_monitoring/cli/ia_healthcheck.py rename web_monitoring/{ => diff}/content_type.py (100%) rename web_monitoring/{ => diff}/diff_errors.py (100%) rename web_monitoring/{ => diff}/differs.py (99%) rename web_monitoring/{ => diff}/html_diff_render.py (99%) rename web_monitoring/{ => diff}/links_diff.py (99%) rename web_monitoring/{diffing_server.py => diff_server/server.py} (94%) delete mode 100644 web_monitoring/filtering.py diff --git a/scripts/annotations_import b/scripts/annotations_import index dd1a71185..c21a52eb1 100755 --- a/scripts/annotations_import +++ b/scripts/annotations_import @@ -1,168 +1,6 @@ #!/usr/bin/env python -import csv -from docopt import docopt -import logging -import os -import re -from tqdm import tqdm -from web_monitoring import db +from web_monitoring.cli.annotations_import import main -logger = logging.getLogger(__name__) -log_level = os.getenv('LOG_LEVEL', 'WARNING') -logger.setLevel(logging.__dict__[log_level]) - -class DictReaderStrip(csv.DictReader): - @property - def fieldnames(self): - return [name.strip() for name in super().fieldnames] - -def read_csv(csv_path): - with open(csv_path, newline='') as csvfile: - reader = DictReaderStrip(csvfile) - for row in reader: - yield row - -DIFF_URL_REGEX = re.compile(r'^.*/page/(.*)/(.*)\.\.(.*)') -def find_change_ids(csv_row): - diff_url = csv_row['Last Two - Side by Side'] - regex_result = DIFF_URL_REGEX.match(diff_url) - if regex_result: - (page_id, from_version_id, to_version_id) = regex_result.groups() - return {'page_id': page_id, - 'from_version_id': from_version_id, - 'to_version_id': to_version_id} - else: - return None - -class AnnotationAttributeInfo: - def __init__(self, column_names, 
json_key): - self.column_names = column_names - self.json_key = json_key - -class CsvSchemaError(Exception): - ... - -# If column names ever change while leaving the value semantics intact, -# add the new name to the correct list of column names here -BOOL_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [ - (['Language alteration'], - 'language_alteration'), - (['Link change/addition/removal'], - 'link_change'), - (['Repeated Change across many pages or a domain'], - 'repeated_change'), - (['Alteration within sections of a webpage'], - 'alteration_within_sections'), - (['Alteration, removal, or addition of entire section(s) of a webpage'], - 'alteration_entire_sections'), - (['Alteration, removal, or addition of an entire webpage or document'], - 'alteration_entire_webpage_or_document'), - (['Overhaul, removal, or addition of an entire website'], - 'alteration_entire_website'), - (['Alteration, removal, or addition of datasets'], - 'alteration_dataset')]] - -STRING_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [ - (['Is this primarily a content or access change (or both)?'], - 'content_or_access_change'), - (['Brief Description'], - 'brief_description'), - (['Topic 1'], - 'topic_1'), - (['Subtopic 1a'], - 'subtopic_1a'), - (['Subtopic 1b'], - 'subtopic_1b'), - (['Topic 2'], - 'topic_2'), - (['Subtopic 2a'], - 'subtopic_2a'), - (['Subtopic 2b'], - 'subtopic_2b'), - (['Topic 3'], - 'topic_3'), - (['Subtopic 3a'], - 'subtopic_3a'), - (['Subtopic 3b'], - 'subtopic_3b'), - (['Any keywords to monitor (e.g. for term analyses)?'], - 'keywords_to_monitor'), - (['Further Notes'], - 'further_notes'), - (['Ask/tell other working groups?'], - 'ask_tell_other_working_groups'), - - # Including this so that we can eventually map it to - # users in the database - (['Who Found This?'], - 'annotation_author')]] - -def get_attribute_value(attribute_info, csv_row): - for column_name in attribute_info.column_names: - if column_name in csv_row: - return csv_row[column_name].strip() - - # Despite being raised in a row-level function, this error means that the - # whole sheet is missing a column, so we don't catch and allow it to crash - raise CsvSchemaError(f'Expected to find one of {attribute_info.column_names} ' - f'in {csv_row.keys()}') - -def create_annotation(csv_row, is_important_changes): - annotation = {} - - for attribute_info in BOOL_ANNOTATION_ATTRIBUTES: - attribute_value = get_attribute_value(attribute_info, csv_row) - annotation[attribute_info.json_key] = attribute_value == '1' - for attribute_info in STRING_ANNOTATION_ATTRIBUTES: - attribute_value = get_attribute_value(attribute_info, csv_row) - annotation[attribute_info.json_key] = attribute_value - - # This will need additional logic to determine the actual sheet schema - annotation['annotation_schema'] = 'edgi_analyst_v2' - - significance = 0.0 - if is_important_changes: - importance_significance_mapping = { - 'low': 0.5, - 'medium': 0.75, - 'high': 1.0 - } - row_importance = csv_row['Importance?'].lower().strip() - significance = importance_significance_mapping.get(row_importance, 0.0) - annotation['significance'] = significance - - return annotation - -def main(): - doc = """Add analyst annotations from a csv file to the Web Monitoring db. - -Usage: -path/to/annotations_import.py [--is_important_changes] - -Options: ---is_important_changes Was this CSV generated from an Important Changes sheet? 
-""" - arguments = docopt(doc) - is_important_changes = arguments['--is_important_changes'] - csv_path = arguments[''] - - client = db.Client.from_env() - # Missing step: Analyze CSV to determine spreadsheet schema version - for row in tqdm(read_csv(csv_path), unit=' rows'): - change_ids = find_change_ids(row) - annotation = create_annotation(row, is_important_changes) - if not change_ids: - logger.warning(f'failed to extract IDs from {row}') - if not annotation: - logger.warning(f'failed to extract annotation data from {row}') - if change_ids and annotation: - try: - response = client.add_annotation(**change_ids, - annotation=annotation) - logger.debug(response) - except db.WebMonitoringDbError as e: - logger.warning( - f'failed to post annotation for row {row} with error: {e}') if __name__ == '__main__': main() diff --git a/scripts/ia_healthcheck b/scripts/ia_healthcheck index 4c6351aa9..b0722f48f 100755 --- a/scripts/ia_healthcheck +++ b/scripts/ia_healthcheck @@ -1,125 +1,6 @@ #!/usr/bin/env python +from web_monitoring.cli.ia_healthcheck import main -# This script checks whether the Internet Archive's Wayback Machine has -# recent captures of the URLs we are tracking in the Web Monitoring Database. -# It works by taking a random sample of pages from the DB and using the CDX API -# to check that each has been captured at least once in the last few days. -from datetime import datetime, timedelta -import random -import sentry_sdk -import sys -from web_monitoring import db -from wayback import WaybackClient - - -# The current Sentry client truncates string values at 512 characters. It -# appears that monkey-patching this module global is only way to change it and -# that doing so is the intended method: -# https://github.com/getsentry/sentry-python/blob/5f9f7c469af16a731948a482ea162c2348800999/sentry_sdk/utils.py#L662-L664 -# That doesn't seem great, so I've asked about this on their forums: -# https://forum.sentry.io/t/some-stack-traces-are-truncated/7309/4 -sentry_sdk.utils.MAX_STRING_LENGTH = 2048 - - -MAX_CAPTURE_AGE = timedelta(hours=72) -LINKS_TO_CHECK = 10 - -# Sentry automatically instantiates with the `SENTRY_DSN` environment variable. -# If not set, all its methods will operate conveniently as no-ops. -sentry_sdk.init() - - -def sample_monitored_urls(sample_size): - """ - Get a random sample of `sample_size` URLs that are tracked in a Web - Monitoring DB instance. - - Returns - ------- - list of string - """ - client = db.Client.from_env() - page = client.list_pages(chunk=1, chunk_size=1, active=True, include_total=True) - url_count = page['meta']['total_results'] - return (get_page_url(client, index) - for index in random.sample(range(url_count), sample_size)) - - -def get_page_url(client, index): - return client.list_pages(chunk=index, chunk_size=1, active=True)['data'][0]['url'] - - -def wayback_has_captures(url, from_date=None): - """ - Determine whether the Wayback Machine has any recent captures of a URL. - - Parameters - ---------- - url : string - - Returns - ------- - list of JSON - """ - with WaybackClient() as wayback: - versions = wayback.search(url, from_date=from_date) - try: - next(versions) - except StopIteration: - return False - else: - return True - - -def output_results(statuses): - """ - Output nicely formatted results. 
- - Parameters - ---------- - statuses: sequence of tuple of (str, bool) - """ - healthy_links = 0 - unhealthy_links = 0 - - logs = [] - for (url, status) in statuses: - if status: - healthy_links += 1 - status_text = '✔︎ Found' - else: - unhealthy_links += 1 - status_text = '✘ Missing' - - message = f'{status_text}: {url}' - print(message) - logs.append(message) - - # At this point, everything is OK; we don't need breadcrumbs and other - # extra noise to come with the message we are about to send. - with sentry_sdk.configure_scope() as scope: - scope.clear() - - if healthy_links + unhealthy_links == 0: - print('Failed to sampled any pages!') - sentry_sdk.capture_message('Failed to sampled any pages!') - else: - message = f'\nFound: {healthy_links} healthy links and {unhealthy_links} unhealthy links.' - print(message) - if unhealthy_links > 0: - log_string = '\n'.join(logs) - sentry_sdk.capture_message(f'{message}\n{log_string}') - - -if __name__ == "__main__": - try: - print(f'Sampling {LINKS_TO_CHECK} pages from Web Monitoring API...') - links = sample_monitored_urls(LINKS_TO_CHECK) - from_date = datetime.now() - MAX_CAPTURE_AGE - print(f'Checking for captures in Wayback Machine...') - capture_statuses = ((url, wayback_has_captures(url, from_date)) - for url in links) - output_results(capture_statuses) - except db.MissingCredentials as error: - print(error, file=sys.stderr) +if __name__ == '__main__': + main() diff --git a/scripts/wm b/scripts/wm index 3b8f06086..d794b7016 100755 --- a/scripts/wm +++ b/scripts/wm @@ -1,5 +1,5 @@ #!/usr/bin/env python -from web_monitoring.cli import main +from web_monitoring.cli.cli import main if __name__ == '__main__': diff --git a/scripts/wm-diffing-server b/scripts/wm-diffing-server index 5cb8d6c90..b02b606f9 100755 --- a/scripts/wm-diffing-server +++ b/scripts/wm-diffing-server @@ -1,6 +1,5 @@ #!/usr/bin/env python -import sys -from web_monitoring.diffing_server import cli +from web_monitoring.diff_server.server import cli if __name__ == '__main__': diff --git a/web_monitoring/cli/annotations_import.py b/web_monitoring/cli/annotations_import.py new file mode 100644 index 000000000..008d03ca0 --- /dev/null +++ b/web_monitoring/cli/annotations_import.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +import csv +from docopt import docopt +import logging +import os +import re +from tqdm import tqdm +from web_monitoring import db + +logger = logging.getLogger(__name__) +log_level = os.getenv('LOG_LEVEL', 'WARNING') +logger.setLevel(logging.__dict__[log_level]) + +class DictReaderStrip(csv.DictReader): + @property + def fieldnames(self): + return [name.strip() for name in super().fieldnames] + +def read_csv(csv_path): + with open(csv_path, newline='') as csvfile: + reader = DictReaderStrip(csvfile) + for row in reader: + yield row + +DIFF_URL_REGEX = re.compile(r'^.*/page/(.*)/(.*)\.\.(.*)') +def find_change_ids(csv_row): + diff_url = csv_row['Last Two - Side by Side'] + regex_result = DIFF_URL_REGEX.match(diff_url) + if regex_result: + (page_id, from_version_id, to_version_id) = regex_result.groups() + return {'page_id': page_id, + 'from_version_id': from_version_id, + 'to_version_id': to_version_id} + else: + return None + +class AnnotationAttributeInfo: + def __init__(self, column_names, json_key): + self.column_names = column_names + self.json_key = json_key + +class CsvSchemaError(Exception): + ... 
+
+# If column names ever change while leaving the value semantics intact,
+# add the new name to the correct list of column names here
+BOOL_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
+ (['Language alteration'],
+ 'language_alteration'),
+ (['Link change/addition/removal'],
+ 'link_change'),
+ (['Repeated Change across many pages or a domain'],
+ 'repeated_change'),
+ (['Alteration within sections of a webpage'],
+ 'alteration_within_sections'),
+ (['Alteration, removal, or addition of entire section(s) of a webpage'],
+ 'alteration_entire_sections'),
+ (['Alteration, removal, or addition of an entire webpage or document'],
+ 'alteration_entire_webpage_or_document'),
+ (['Overhaul, removal, or addition of an entire website'],
+ 'alteration_entire_website'),
+ (['Alteration, removal, or addition of datasets'],
+ 'alteration_dataset')]]
+
+STRING_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
+ (['Is this primarily a content or access change (or both)?'],
+ 'content_or_access_change'),
+ (['Brief Description'],
+ 'brief_description'),
+ (['Topic 1'],
+ 'topic_1'),
+ (['Subtopic 1a'],
+ 'subtopic_1a'),
+ (['Subtopic 1b'],
+ 'subtopic_1b'),
+ (['Topic 2'],
+ 'topic_2'),
+ (['Subtopic 2a'],
+ 'subtopic_2a'),
+ (['Subtopic 2b'],
+ 'subtopic_2b'),
+ (['Topic 3'],
+ 'topic_3'),
+ (['Subtopic 3a'],
+ 'subtopic_3a'),
+ (['Subtopic 3b'],
+ 'subtopic_3b'),
+ (['Any keywords to monitor (e.g. for term analyses)?'],
+ 'keywords_to_monitor'),
+ (['Further Notes'],
+ 'further_notes'),
+ (['Ask/tell other working groups?'],
+ 'ask_tell_other_working_groups'),
+
+ # Including this so that we can eventually map it to
+ # users in the database
+ (['Who Found This?'],
+ 'annotation_author')]]
+
+def get_attribute_value(attribute_info, csv_row):
+ for column_name in attribute_info.column_names:
+ if column_name in csv_row:
+ return csv_row[column_name].strip()
+
+ # Despite being raised in a row-level function, this error means that the
+ # whole sheet is missing a column, so we don't catch and allow it to crash
+ raise CsvSchemaError(f'Expected to find one of {attribute_info.column_names} '
+ f'in {csv_row.keys()}')
+
+def create_annotation(csv_row, is_important_changes):
+ annotation = {}
+
+ for attribute_info in BOOL_ANNOTATION_ATTRIBUTES:
+ attribute_value = get_attribute_value(attribute_info, csv_row)
+ annotation[attribute_info.json_key] = attribute_value == '1'
+ for attribute_info in STRING_ANNOTATION_ATTRIBUTES:
+ attribute_value = get_attribute_value(attribute_info, csv_row)
+ annotation[attribute_info.json_key] = attribute_value
+
+ # This will need additional logic to determine the actual sheet schema
+ annotation['annotation_schema'] = 'edgi_analyst_v2'
+
+ significance = 0.0
+ if is_important_changes:
+ importance_significance_mapping = {
+ 'low': 0.5,
+ 'medium': 0.75,
+ 'high': 1.0
+ }
+ row_importance = csv_row['Importance?'].lower().strip()
+ significance = importance_significance_mapping.get(row_importance, 0.0)
+ annotation['significance'] = significance
+
+ return annotation
+
+def main():
+ doc = """Add analyst annotations from a csv file to the Web Monitoring db.
+
+Usage:
+path/to/annotations_import.py <csv_path> [--is_important_changes]
+
+Options:
+--is_important_changes Was this CSV generated from an Important Changes sheet?
+""" + arguments = docopt(doc) + is_important_changes = arguments['--is_important_changes'] + csv_path = arguments[''] + + client = db.Client.from_env() + # Missing step: Analyze CSV to determine spreadsheet schema version + for row in tqdm(read_csv(csv_path), unit=' rows'): + change_ids = find_change_ids(row) + annotation = create_annotation(row, is_important_changes) + if not change_ids: + logger.warning(f'failed to extract IDs from {row}') + if not annotation: + logger.warning(f'failed to extract annotation data from {row}') + if change_ids and annotation: + try: + response = client.add_annotation(**change_ids, + annotation=annotation) + logger.debug(response) + except db.WebMonitoringDbError as e: + logger.warning( + f'failed to post annotation for row {row} with error: {e}') diff --git a/web_monitoring/cli.py b/web_monitoring/cli/cli.py similarity index 100% rename from web_monitoring/cli.py rename to web_monitoring/cli/cli.py diff --git a/web_monitoring/cli/ia_healthcheck.py b/web_monitoring/cli/ia_healthcheck.py new file mode 100644 index 000000000..ace46095f --- /dev/null +++ b/web_monitoring/cli/ia_healthcheck.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python + +# This script checks whether the Internet Archive's Wayback Machine has +# recent captures of the URLs we are tracking in the Web Monitoring Database. +# It works by taking a random sample of pages from the DB and using the CDX API +# to check that each has been captured at least once in the last few days. + +from datetime import datetime, timedelta +import random +import sentry_sdk +import sys +from wayback import WaybackClient +from .. import db + + +# The current Sentry client truncates string values at 512 characters. It +# appears that monkey-patching this module global is only way to change it and +# that doing so is the intended method: +# https://github.com/getsentry/sentry-python/blob/5f9f7c469af16a731948a482ea162c2348800999/sentry_sdk/utils.py#L662-L664 +# That doesn't seem great, so I've asked about this on their forums: +# https://forum.sentry.io/t/some-stack-traces-are-truncated/7309/4 +sentry_sdk.utils.MAX_STRING_LENGTH = 2048 + + +MAX_CAPTURE_AGE = timedelta(hours=72) +LINKS_TO_CHECK = 10 + +# Sentry automatically instantiates with the `SENTRY_DSN` environment variable. +# If not set, all its methods will operate conveniently as no-ops. +sentry_sdk.init() + + +def sample_monitored_urls(sample_size): + """ + Get a random sample of `sample_size` URLs that are tracked in a Web + Monitoring DB instance. + + Returns + ------- + list of string + """ + client = db.Client.from_env() + page = client.list_pages(chunk=1, chunk_size=1, active=True, include_total=True) + url_count = page['meta']['total_results'] + return (get_page_url(client, index) + for index in random.sample(range(url_count), sample_size)) + + +def get_page_url(client, index): + return client.list_pages(chunk=index, chunk_size=1, active=True)['data'][0]['url'] + + +def wayback_has_captures(url, from_date=None): + """ + Determine whether the Wayback Machine has any recent captures of a URL. + + Parameters + ---------- + url : string + + Returns + ------- + list of JSON + """ + with WaybackClient() as wayback: + versions = wayback.search(url, from_date=from_date) + try: + next(versions) + except StopIteration: + return False + else: + return True + + +def output_results(statuses): + """ + Output nicely formatted results. 
+
+ Parameters
+ ----------
+ statuses: sequence of tuple of (str, bool)
+ """
+ healthy_links = 0
+ unhealthy_links = 0
+
+ logs = []
+ for (url, status) in statuses:
+ if status:
+ healthy_links += 1
+ status_text = '✔︎ Found'
+ else:
+ unhealthy_links += 1
+ status_text = '✘ Missing'
+
+ message = f'{status_text}: {url}'
+ print(message)
+ logs.append(message)
+
+ # At this point, everything is OK; we don't need breadcrumbs and other
+ # extra noise to come with the message we are about to send.
+ with sentry_sdk.configure_scope() as scope:
+ scope.clear()
+
+ if healthy_links + unhealthy_links == 0:
+ print('Failed to sample any pages!')
+ sentry_sdk.capture_message('Failed to sample any pages!')
+ else:
+ message = f'\nFound: {healthy_links} healthy links and {unhealthy_links} unhealthy links.'
+ print(message)
+ if unhealthy_links > 0:
+ log_string = '\n'.join(logs)
+ sentry_sdk.capture_message(f'{message}\n{log_string}')
+
+
+def main():
+ try:
+ print(f'Sampling {LINKS_TO_CHECK} pages from Web Monitoring API...')
+ links = sample_monitored_urls(LINKS_TO_CHECK)
+ from_date = datetime.now() - MAX_CAPTURE_AGE
+ print(f'Checking for captures in Wayback Machine...')
+ capture_statuses = ((url, wayback_has_captures(url, from_date))
+ for url in links)
+ output_results(capture_statuses)
+ except db.MissingCredentials as error:
+ print(error, file=sys.stderr)
diff --git a/web_monitoring/content_type.py b/web_monitoring/diff/content_type.py
similarity index 100%
rename from web_monitoring/content_type.py
rename to web_monitoring/diff/content_type.py
diff --git a/web_monitoring/diff_errors.py b/web_monitoring/diff/diff_errors.py
similarity index 100%
rename from web_monitoring/diff_errors.py
rename to web_monitoring/diff/diff_errors.py
diff --git a/web_monitoring/differs.py b/web_monitoring/diff/differs.py
similarity index 99%
rename from web_monitoring/differs.py
rename to web_monitoring/diff/differs.py
index 41f7b47da..44ef9f7e5 100644
--- a/web_monitoring/differs.py
+++ b/web_monitoring/diff/differs.py
@@ -1,6 +1,6 @@
 from bs4 import Comment
 from diff_match_patch import diff, diff_bytes
-from web_monitoring.utils import get_color_palette
+from ..utils import get_color_palette
 from htmldiffer.diff import HTMLDiffer
 import htmltreediff
 import html5_parser
diff --git a/web_monitoring/html_diff_render.py b/web_monitoring/diff/html_diff_render.py
similarity index 99%
rename from web_monitoring/html_diff_render.py
rename to web_monitoring/diff/html_diff_render.py
index 00e15f2d1..d6fe0606a 100644
--- a/web_monitoring/html_diff_render.py
+++ b/web_monitoring/diff/html_diff_render.py
@@ -20,7 +20,7 @@
 from functools import lru_cache
 import copy
 import difflib
-from web_monitoring.utils import get_color_palette
+from ..utils import get_color_palette
 import html
 import html5_parser
 import logging
diff --git a/web_monitoring/links_diff.py b/web_monitoring/diff/links_diff.py
similarity index 99%
rename from web_monitoring/links_diff.py
rename to web_monitoring/diff/links_diff.py
index 0b053079c..634d8873b 100644
--- a/web_monitoring/links_diff.py
+++ b/web_monitoring/diff/links_diff.py
@@ -1,7 +1,7 @@
 import html5_parser
 from .content_type import raise_if_not_diffable_html
 from .differs import compute_dmp_diff
-from web_monitoring.utils import get_color_palette
+from ..utils import get_color_palette
 from difflib import SequenceMatcher
 from .html_diff_render import (get_title, _html_for_dmp_operation,
 undiffable_content_tags)
diff --git a/web_monitoring/diffing_server.py
b/web_monitoring/diff_server/server.py similarity index 94% rename from web_monitoring/diffing_server.py rename to web_monitoring/diff_server/server.py index 6892dc157..468a54639 100644 --- a/web_monitoring/diffing_server.py +++ b/web_monitoring/diff_server/server.py @@ -14,10 +14,8 @@ import tornado.web import traceback import web_monitoring -import web_monitoring.differs -from web_monitoring.diff_errors import UndiffableContentError, UndecodableContentError -import web_monitoring.html_diff_render -import web_monitoring.links_diff +from ..diff import differs, html_diff_render, links_diff +from ..diff.diff_errors import UndiffableContentError, UndecodableContentError # Track errors with Sentry.io. It will automatically detect the `SENTRY_DSN` # environment variable. If not set, all its methods will operate conveniently @@ -32,26 +30,26 @@ # Map tokens in the REST API to functions in modules. # The modules do not have to be part of the web_monitoring package. DIFF_ROUTES = { - "length": web_monitoring.differs.compare_length, - "identical_bytes": web_monitoring.differs.identical_bytes, - "side_by_side_text": web_monitoring.differs.side_by_side_text, - "links": web_monitoring.links_diff.links_diff_html, - "links_json": web_monitoring.links_diff.links_diff_json, + "length": differs.compare_length, + "identical_bytes": differs.identical_bytes, + "side_by_side_text": differs.side_by_side_text, + "links": links_diff.links_diff_html, + "links_json": links_diff.links_diff_json, # applying diff-match-patch (dmp) to strings (no tokenization) - "html_text_dmp": web_monitoring.differs.html_text_diff, - "html_source_dmp": web_monitoring.differs.html_source_diff, + "html_text_dmp": differs.html_text_diff, + "html_source_dmp": differs.html_source_diff, # three different approaches to the same goal: - "html_token": web_monitoring.html_diff_render.html_diff_render, - "html_tree": web_monitoring.differs.html_tree_diff, - "html_perma_cc": web_monitoring.differs.html_differ, + "html_token": html_diff_render.html_diff_render, + "html_tree": differs.html_tree_diff, + "html_perma_cc": differs.html_differ, # deprecated synonyms - "links_diff": web_monitoring.links_diff.links_diff, - "html_text_diff": web_monitoring.differs.html_text_diff, - "html_source_diff": web_monitoring.differs.html_source_diff, - "html_visual_diff": web_monitoring.html_diff_render.html_diff_render, - "html_tree_diff": web_monitoring.differs.html_tree_diff, - "html_differ": web_monitoring.differs.html_differ, + "links_diff": links_diff.links_diff, + "html_text_diff": differs.html_text_diff, + "html_source_diff": differs.html_source_diff, + "html_visual_diff": html_diff_render.html_diff_render, + "html_tree_diff": differs.html_tree_diff, + "html_differ": differs.html_differ, } # Matches a tag in HTML used to specify the character encoding: diff --git a/web_monitoring/filtering.py b/web_monitoring/filtering.py deleted file mode 100644 index 536738138..000000000 --- a/web_monitoring/filtering.py +++ /dev/null @@ -1,60 +0,0 @@ -from urllib.parse import urlparse -import html5_parser - - -day_list = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'] -month_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] -tag_list = ['
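Note on downstream imports: the renames above move the previously flat modules into the cli/, diff/, and diff_server/ subpackages. A minimal sketch of how calling code outside this patch would update its imports, using only module and function names visible in the renames and stubs above (the `as ...` aliases are illustrative; whether anything is re-exported from the package root is not shown in this patch):

    # old: from web_monitoring.cli import main
    from web_monitoring.cli.cli import main as wm_main
    # old: from web_monitoring.differs import html_text_diff
    from web_monitoring.diff.differs import html_text_diff
    # old: from web_monitoring.links_diff import links_diff_html
    from web_monitoring.diff.links_diff import links_diff_html
    # old: from web_monitoring.diffing_server import cli
    from web_monitoring.diff_server.server import cli as diffing_server_cli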