Commit
Reorganize content of web_monitoring into directories (#528)
This reorganizes the content of web_monitoring into a hierarchy of modules for easier management and comprehension. See the discussion in #206 for more.

    scripts/                 # Stubs for things in web_monitoring/cli
        annotations_import
        ia_healthcheck
        wm
        wm-diffing-server
    web_monitoring/
        tests/               [same as today]
        diff/
            content_type.py
            differs.py
            diff_errors.py
            html_diff_render.py
            links_diff.py
        diff_server/
            server.py
        cli/
            cli.py
            ia_healthcheck.py
            ia_import.py
            annotations_import.py
        __init__.py
        _version.py
        utils.py
        db.py

This also drops `filtering.py`, which was vestigial and no longer used.
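Each entry in scripts/ is now a thin stub that delegates to the relocated module under web_monitoring/cli, as the two diffs shown below illustrate. The pattern, sketched here for annotations_import:

    #!/usr/bin/env python
    from web_monitoring.cli.annotations_import import main


    if __name__ == '__main__':
        main()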
Showing 20 changed files with 330 additions and 384 deletions.
scripts/annotations_import
@@ -1,168 +1,6 @@
#!/usr/bin/env python
import csv
from docopt import docopt
import logging
import os
import re
from tqdm import tqdm
from web_monitoring import db
from web_monitoring.cli.annotations_import import main

logger = logging.getLogger(__name__)
log_level = os.getenv('LOG_LEVEL', 'WARNING')
logger.setLevel(logging.__dict__[log_level])

class DictReaderStrip(csv.DictReader):
    @property
    def fieldnames(self):
        return [name.strip() for name in super().fieldnames]

def read_csv(csv_path):
    with open(csv_path, newline='') as csvfile:
        reader = DictReaderStrip(csvfile)
        for row in reader:
            yield row

DIFF_URL_REGEX = re.compile(r'^.*/page/(.*)/(.*)\.\.(.*)')
def find_change_ids(csv_row):
    diff_url = csv_row['Last Two - Side by Side']
    regex_result = DIFF_URL_REGEX.match(diff_url)
    if regex_result:
        (page_id, from_version_id, to_version_id) = regex_result.groups()
        return {'page_id': page_id,
                'from_version_id': from_version_id,
                'to_version_id': to_version_id}
    else:
        return None

class AnnotationAttributeInfo:
    def __init__(self, column_names, json_key):
        self.column_names = column_names
        self.json_key = json_key

class CsvSchemaError(Exception):
    ...

# If column names ever change while leaving the value semantics intact,
# add the new name to the correct list of column names here
BOOL_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
    (['Language alteration'],
     'language_alteration'),
    (['Link change/addition/removal'],
     'link_change'),
    (['Repeated Change across many pages or a domain'],
     'repeated_change'),
    (['Alteration within sections of a webpage'],
     'alteration_within_sections'),
    (['Alteration, removal, or addition of entire section(s) of a webpage'],
     'alteration_entire_sections'),
    (['Alteration, removal, or addition of an entire webpage or document'],
     'alteration_entire_webpage_or_document'),
    (['Overhaul, removal, or addition of an entire website'],
     'alteration_entire_website'),
    (['Alteration, removal, or addition of datasets'],
     'alteration_dataset')]]

STRING_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
    (['Is this primarily a content or access change (or both)?'],
     'content_or_access_change'),
    (['Brief Description'],
     'brief_description'),
    (['Topic 1'],
     'topic_1'),
    (['Subtopic 1a'],
     'subtopic_1a'),
    (['Subtopic 1b'],
     'subtopic_1b'),
    (['Topic 2'],
     'topic_2'),
    (['Subtopic 2a'],
     'subtopic_2a'),
    (['Subtopic 2b'],
     'subtopic_2b'),
    (['Topic 3'],
     'topic_3'),
    (['Subtopic 3a'],
     'subtopic_3a'),
    (['Subtopic 3b'],
     'subtopic_3b'),
    (['Any keywords to monitor (e.g. for term analyses)?'],
     'keywords_to_monitor'),
    (['Further Notes'],
     'further_notes'),
    (['Ask/tell other working groups?'],
     'ask_tell_other_working_groups'),

    # Including this so that we can eventually map it to
    # users in the database
    (['Who Found This?'],
     'annotation_author')]]

def get_attribute_value(attribute_info, csv_row):
    for column_name in attribute_info.column_names:
        if column_name in csv_row:
            return csv_row[column_name].strip()

    # Despite being raised in a row-level function, this error means that the
    # whole sheet is missing a column, so we don't catch and allow it to crash
    raise CsvSchemaError(f'Expected to find one of {attribute_info.column_names} '
                         f'in {csv_row.keys()}')

def create_annotation(csv_row, is_important_changes):
    annotation = {}

    for attribute_info in BOOL_ANNOTATION_ATTRIBUTES:
        attribute_value = get_attribute_value(attribute_info, csv_row)
        annotation[attribute_info.json_key] = attribute_value == '1'
    for attribute_info in STRING_ANNOTATION_ATTRIBUTES:
        attribute_value = get_attribute_value(attribute_info, csv_row)
        annotation[attribute_info.json_key] = attribute_value

    # This will need additional logic to determine the actual sheet schema
    annotation['annotation_schema'] = 'edgi_analyst_v2'

    significance = 0.0
    if is_important_changes:
        importance_significance_mapping = {
            'low': 0.5,
            'medium': 0.75,
            'high': 1.0
        }
        row_importance = csv_row['Importance?'].lower().strip()
        significance = importance_significance_mapping.get(row_importance, 0.0)
    annotation['significance'] = significance

    return annotation

def main():
    doc = """Add analyst annotations from a csv file to the Web Monitoring db.

    Usage:
    path/to/annotations_import.py <csv_path> [--is_important_changes]

    Options:
    --is_important_changes  Was this CSV generated from an Important Changes sheet?
    """
    arguments = docopt(doc)
    is_important_changes = arguments['--is_important_changes']
    csv_path = arguments['<csv_path>']

    client = db.Client.from_env()
    # Missing step: Analyze CSV to determine spreadsheet schema version
    for row in tqdm(read_csv(csv_path), unit=' rows'):
        change_ids = find_change_ids(row)
        annotation = create_annotation(row, is_important_changes)
        if not change_ids:
            logger.warning(f'failed to extract IDs from {row}')
        if not annotation:
            logger.warning(f'failed to extract annotation data from {row}')
        if change_ids and annotation:
            try:
                response = client.add_annotation(**change_ids,
                                                 annotation=annotation)
                logger.debug(response)
            except db.WebMonitoringDbError as e:
                logger.warning(
                    f'failed to post annotation for row {row} with error: {e}')

if __name__ == '__main__':
    main()
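A side benefit of the move is that the old script's helpers become importable library code. A minimal sketch, assuming find_change_ids moved verbatim into web_monitoring.cli.annotations_import along with main (the example URL is hypothetical):

    # Assumes find_change_ids now lives in web_monitoring.cli.annotations_import.
    from web_monitoring.cli.annotations_import import find_change_ids

    # A hypothetical diff URL in the .../page/<page>/<from>..<to> format the
    # DIFF_URL_REGEX above expects.
    row = {'Last Two - Side by Side': 'https://example.org/page/abc/v1..v2'}
    print(find_change_ids(row))
    # -> {'page_id': 'abc', 'from_version_id': 'v1', 'to_version_id': 'v2'}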
scripts/ia_healthcheck
@@ -1,125 +1,6 @@
#!/usr/bin/env python
from web_monitoring.cli.ia_healthcheck import main

# This script checks whether the Internet Archive's Wayback Machine has
# recent captures of the URLs we are tracking in the Web Monitoring Database.
# It works by taking a random sample of pages from the DB and using the CDX API
# to check that each has been captured at least once in the last few days.

from datetime import datetime, timedelta
import random
import sentry_sdk
import sys
from web_monitoring import db
from wayback import WaybackClient


# The current Sentry client truncates string values at 512 characters. It
# appears that monkey-patching this module global is the only way to change it
# and that doing so is the intended method:
# https://github.com/getsentry/sentry-python/blob/5f9f7c469af16a731948a482ea162c2348800999/sentry_sdk/utils.py#L662-L664
# That doesn't seem great, so I've asked about this on their forums:
# https://forum.sentry.io/t/some-stack-traces-are-truncated/7309/4
sentry_sdk.utils.MAX_STRING_LENGTH = 2048


MAX_CAPTURE_AGE = timedelta(hours=72)
LINKS_TO_CHECK = 10

# Sentry automatically instantiates with the `SENTRY_DSN` environment variable.
# If not set, all its methods will operate conveniently as no-ops.
sentry_sdk.init()


def sample_monitored_urls(sample_size):
    """
    Get a random sample of `sample_size` URLs that are tracked in a Web
    Monitoring DB instance.

    Returns
    -------
    list of string
    """
    client = db.Client.from_env()
    page = client.list_pages(chunk=1, chunk_size=1, active=True, include_total=True)
    url_count = page['meta']['total_results']
    return (get_page_url(client, index)
            for index in random.sample(range(url_count), sample_size))


def get_page_url(client, index):
    return client.list_pages(chunk=index, chunk_size=1, active=True)['data'][0]['url']


def wayback_has_captures(url, from_date=None):
    """
    Determine whether the Wayback Machine has any recent captures of a URL.

    Parameters
    ----------
    url : string

    Returns
    -------
    bool
    """
    with WaybackClient() as wayback:
        versions = wayback.search(url, from_date=from_date)
        try:
            next(versions)
        except StopIteration:
            return False
        else:
            return True


def output_results(statuses):
    """
    Output nicely formatted results.

    Parameters
    ----------
    statuses : sequence of tuple of (str, bool)
    """
    healthy_links = 0
    unhealthy_links = 0

    logs = []
    for (url, status) in statuses:
        if status:
            healthy_links += 1
            status_text = '✔︎ Found'
        else:
            unhealthy_links += 1
            status_text = '✘ Missing'

        message = f'{status_text}: {url}'
        print(message)
        logs.append(message)

    # At this point, everything is OK; we don't need breadcrumbs and other
    # extra noise to come with the message we are about to send.
    with sentry_sdk.configure_scope() as scope:
        scope.clear()

    if healthy_links + unhealthy_links == 0:
        print('Failed to sample any pages!')
        sentry_sdk.capture_message('Failed to sample any pages!')
    else:
        message = f'\nFound: {healthy_links} healthy links and {unhealthy_links} unhealthy links.'
        print(message)
        if unhealthy_links > 0:
            log_string = '\n'.join(logs)
            sentry_sdk.capture_message(f'{message}\n{log_string}')


if __name__ == "__main__":
    try:
        print(f'Sampling {LINKS_TO_CHECK} pages from Web Monitoring API...')
        links = sample_monitored_urls(LINKS_TO_CHECK)
        from_date = datetime.now() - MAX_CAPTURE_AGE
        print('Checking for captures in Wayback Machine...')
        capture_statuses = ((url, wayback_has_captures(url, from_date))
                            for url in links)
        output_results(capture_statuses)
    except db.MissingCredentials as error:
        print(error, file=sys.stderr)

if __name__ == '__main__':
    main()
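Likewise, the healthcheck's pieces can now be exercised individually. A short sketch, assuming wayback_has_captures moved unchanged into web_monitoring.cli.ia_healthcheck:

    # Assumes wayback_has_captures is importable from the relocated module.
    from datetime import datetime, timedelta
    from web_monitoring.cli.ia_healthcheck import wayback_has_captures

    # Has this URL been captured within the last 72 hours (MAX_CAPTURE_AGE)?
    from_date = datetime.now() - timedelta(hours=72)
    print(wayback_has_captures('https://example.com/', from_date=from_date))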