Reorganize content of web_monitoring into directories #528

Merged: 6 commits, Dec 9, 2019
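The four file diffs below all make the same kind of change: each executable in scripts/ is reduced to a thin wrapper that imports its main entry point from the reorganized package (web_monitoring.cli.* or web_monitoring.diff_server.*), while the logic that previously lived in the script moves into those modules. As a minimal sketch of the resulting wrapper form, mirroring the new scripts/annotations_import shown in the first diff:

#!/usr/bin/env python
# Thin wrapper script: the real logic now lives in the package module
# added by this PR (web_monitoring.cli.annotations_import).
from web_monitoring.cli.annotations_import import main


if __name__ == '__main__':
    main()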
164 changes: 1 addition & 163 deletions scripts/annotations_import
@@ -1,168 +1,6 @@
#!/usr/bin/env python
import csv
from docopt import docopt
import logging
import os
import re
from tqdm import tqdm
from web_monitoring import db
from web_monitoring.cli.annotations_import import main

logger = logging.getLogger(__name__)
log_level = os.getenv('LOG_LEVEL', 'WARNING')
logger.setLevel(logging.__dict__[log_level])

class DictReaderStrip(csv.DictReader):
    @property
    def fieldnames(self):
        return [name.strip() for name in super().fieldnames]

def read_csv(csv_path):
    with open(csv_path, newline='') as csvfile:
        reader = DictReaderStrip(csvfile)
        for row in reader:
            yield row

DIFF_URL_REGEX = re.compile(r'^.*/page/(.*)/(.*)\.\.(.*)')
def find_change_ids(csv_row):
    diff_url = csv_row['Last Two - Side by Side']
    regex_result = DIFF_URL_REGEX.match(diff_url)
    if regex_result:
        (page_id, from_version_id, to_version_id) = regex_result.groups()
        return {'page_id': page_id,
                'from_version_id': from_version_id,
                'to_version_id': to_version_id}
    else:
        return None

class AnnotationAttributeInfo:
    def __init__(self, column_names, json_key):
        self.column_names = column_names
        self.json_key = json_key

class CsvSchemaError(Exception):
    ...

# If column names ever change while leaving the value semantics intact,
# add the new name to the correct list of column names here
BOOL_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
    (['Language alteration'],
     'language_alteration'),
    (['Link change/addition/removal'],
     'link_change'),
    (['Repeated Change across many pages or a domain'],
     'repeated_change'),
    (['Alteration within sections of a webpage'],
     'alteration_within_sections'),
    (['Alteration, removal, or addition of entire section(s) of a webpage'],
     'alteration_entire_sections'),
    (['Alteration, removal, or addition of an entire webpage or document'],
     'alteration_entire_webpage_or_document'),
    (['Overhaul, removal, or addition of an entire website'],
     'alteration_entire_website'),
    (['Alteration, removal, or addition of datasets'],
     'alteration_dataset')]]

STRING_ANNOTATION_ATTRIBUTES = [AnnotationAttributeInfo(*info) for info in [
    (['Is this primarily a content or access change (or both)?'],
     'content_or_access_change'),
    (['Brief Description'],
     'brief_description'),
    (['Topic 1'],
     'topic_1'),
    (['Subtopic 1a'],
     'subtopic_1a'),
    (['Subtopic 1b'],
     'subtopic_1b'),
    (['Topic 2'],
     'topic_2'),
    (['Subtopic 2a'],
     'subtopic_2a'),
    (['Subtopic 2b'],
     'subtopic_2b'),
    (['Topic 3'],
     'topic_3'),
    (['Subtopic 3a'],
     'subtopic_3a'),
    (['Subtopic 3b'],
     'subtopic_3b'),
    (['Any keywords to monitor (e.g. for term analyses)?'],
     'keywords_to_monitor'),
    (['Further Notes'],
     'further_notes'),
    (['Ask/tell other working groups?'],
     'ask_tell_other_working_groups'),

    # Including this so that we can eventually map it to
    # users in the database
    (['Who Found This?'],
     'annotation_author')]]

def get_attribute_value(attribute_info, csv_row):
    for column_name in attribute_info.column_names:
        if column_name in csv_row:
            return csv_row[column_name].strip()

    # Despite being raised in a row-level function, this error means that the
    # whole sheet is missing a column, so we don't catch it and instead allow
    # it to crash
    raise CsvSchemaError(f'Expected to find one of {attribute_info.column_names} '
                         f'in {csv_row.keys()}')

def create_annotation(csv_row, is_important_changes):
    annotation = {}

    for attribute_info in BOOL_ANNOTATION_ATTRIBUTES:
        attribute_value = get_attribute_value(attribute_info, csv_row)
        annotation[attribute_info.json_key] = attribute_value == '1'
    for attribute_info in STRING_ANNOTATION_ATTRIBUTES:
        attribute_value = get_attribute_value(attribute_info, csv_row)
        annotation[attribute_info.json_key] = attribute_value

    # This will need additional logic to determine the actual sheet schema
    annotation['annotation_schema'] = 'edgi_analyst_v2'

    significance = 0.0
    if is_important_changes:
        importance_significance_mapping = {
            'low': 0.5,
            'medium': 0.75,
            'high': 1.0
        }
        row_importance = csv_row['Importance?'].lower().strip()
        significance = importance_significance_mapping.get(row_importance, 0.0)
    annotation['significance'] = significance

    return annotation

def main():
    doc = """Add analyst annotations from a csv file to the Web Monitoring db.

Usage:
path/to/annotations_import.py <csv_path> [--is_important_changes]

Options:
--is_important_changes  Was this CSV generated from an Important Changes sheet?
"""
    arguments = docopt(doc)
    is_important_changes = arguments['--is_important_changes']
    csv_path = arguments['<csv_path>']

    client = db.Client.from_env()
    # Missing step: Analyze CSV to determine spreadsheet schema version
    for row in tqdm(read_csv(csv_path), unit=' rows'):
        change_ids = find_change_ids(row)
        annotation = create_annotation(row, is_important_changes)
        if not change_ids:
            logger.warning(f'failed to extract IDs from {row}')
        if not annotation:
            logger.warning(f'failed to extract annotation data from {row}')
        if change_ids and annotation:
            try:
                response = client.add_annotation(**change_ids,
                                                 annotation=annotation)
                logger.debug(response)
            except db.WebMonitoringDbError as e:
                logger.warning(
                    f'failed to post annotation for row {row} with error: {e}')

if __name__ == '__main__':
    main()
125 changes: 3 additions & 122 deletions scripts/ia_healthcheck
@@ -1,125 +1,6 @@
#!/usr/bin/env python
from web_monitoring.cli.ia_healthcheck import main

# This script checks whether the Internet Archive's Wayback Machine has
# recent captures of the URLs we are tracking in the Web Monitoring Database.
# It works by taking a random sample of pages from the DB and using the CDX API
# to check that each has been captured at least once in the last few days.

from datetime import datetime, timedelta
import random
import sentry_sdk
import sys
from web_monitoring import db
from wayback import WaybackClient


# The current Sentry client truncates string values at 512 characters. It
# appears that monkey-patching this module global is only way to change it and
# that doing so is the intended method:
# https://github.com/getsentry/sentry-python/blob/5f9f7c469af16a731948a482ea162c2348800999/sentry_sdk/utils.py#L662-L664
# That doesn't seem great, so I've asked about this on their forums:
# https://forum.sentry.io/t/some-stack-traces-are-truncated/7309/4
sentry_sdk.utils.MAX_STRING_LENGTH = 2048


MAX_CAPTURE_AGE = timedelta(hours=72)
LINKS_TO_CHECK = 10

# Sentry automatically instantiates with the `SENTRY_DSN` environment variable.
# If not set, all its methods will operate conveniently as no-ops.
sentry_sdk.init()


def sample_monitored_urls(sample_size):
    """
    Get a random sample of `sample_size` URLs that are tracked in a Web
    Monitoring DB instance.

    Returns
    -------
    list of string
    """
    client = db.Client.from_env()
    page = client.list_pages(chunk=1, chunk_size=1, active=True, include_total=True)
    url_count = page['meta']['total_results']
    return (get_page_url(client, index)
            for index in random.sample(range(url_count), sample_size))


def get_page_url(client, index):
    return client.list_pages(chunk=index, chunk_size=1, active=True)['data'][0]['url']


def wayback_has_captures(url, from_date=None):
    """
    Determine whether the Wayback Machine has any recent captures of a URL.

    Parameters
    ----------
    url : string

    Returns
    -------
    bool
    """
    with WaybackClient() as wayback:
        versions = wayback.search(url, from_date=from_date)
        try:
            next(versions)
        except StopIteration:
            return False
        else:
            return True


def output_results(statuses):
    """
    Output nicely formatted results.

    Parameters
    ----------
    statuses : sequence of tuple of (str, bool)
    """
    healthy_links = 0
    unhealthy_links = 0

    logs = []
    for (url, status) in statuses:
        if status:
            healthy_links += 1
            status_text = '✔︎ Found'
        else:
            unhealthy_links += 1
            status_text = '✘ Missing'

        message = f'{status_text}: {url}'
        print(message)
        logs.append(message)

    # At this point, everything is OK; we don't need breadcrumbs and other
    # extra noise to come with the message we are about to send.
    with sentry_sdk.configure_scope() as scope:
        scope.clear()

    if healthy_links + unhealthy_links == 0:
        print('Failed to sample any pages!')
        sentry_sdk.capture_message('Failed to sample any pages!')
    else:
        message = f'\nFound: {healthy_links} healthy links and {unhealthy_links} unhealthy links.'
        print(message)
        if unhealthy_links > 0:
            log_string = '\n'.join(logs)
            sentry_sdk.capture_message(f'{message}\n{log_string}')


if __name__ == "__main__":
    try:
        print(f'Sampling {LINKS_TO_CHECK} pages from Web Monitoring API...')
        links = sample_monitored_urls(LINKS_TO_CHECK)
        from_date = datetime.now() - MAX_CAPTURE_AGE
        print(f'Checking for captures in Wayback Machine...')
        capture_statuses = ((url, wayback_has_captures(url, from_date))
                            for url in links)
        output_results(capture_statuses)
    except db.MissingCredentials as error:
        print(error, file=sys.stderr)
if __name__ == '__main__':
    main()
2 changes: 1 addition & 1 deletion scripts/wm
@@ -1,5 +1,5 @@
#!/usr/bin/env python
from web_monitoring.cli import main
from web_monitoring.cli.cli import main


if __name__ == '__main__':
3 changes: 1 addition & 2 deletions scripts/wm-diffing-server
@@ -1,6 +1,5 @@
#!/usr/bin/env python
import sys
from web_monitoring.diffing_server import cli
from web_monitoring.diff_server.server import cli


if __name__ == '__main__':