Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Seed-level video capture setting handling + Job-level PDF-only option #288

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
15 changes: 0 additions & 15 deletions brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,27 +544,12 @@ def dump_state(signum, frame):
finally:
signal.signal(signal.SIGQUIT, dump_state)

def get_skip_av_seeds():
# TODO: develop UI and refactor
SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
try:
# make set from seed IDs in SKIP_AV_SEEDS_FILE
with open(SKIP_AV_SEEDS_FILE) as skips:
skip_av_seeds = {int(l) for l in skips.readlines()}
logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE)
except Exception as e:
skip_av_seeds = set()
logging.info("running with empty skip_av_seeds")
return skip_av_seeds

rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds=skip_av_seeds_from_file,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,
proxy=args.proxy,
Expand Down
47 changes: 36 additions & 11 deletions brozzler/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import uuid
import yaml
import zlib
from enum import Enum
from typing import Optional


Expand Down Expand Up @@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
job.id = job_conf["id"]
if "max_claimed_sites" in job_conf:
job.max_claimed_sites = job_conf["max_claimed_sites"]
if "pdfs_only" in job_conf:
job.pdfs_only = job_conf["pdfs_only"]
job.save()

sites = []
Expand Down Expand Up @@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
def populate_defaults(self):
if not "status" in self:
self.status = "ACTIVE"
if "pdfs_only" not in self:
self.pdfs_only = False
if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
Expand All @@ -220,33 +225,53 @@ def finish(self):
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()


class VideoCaptureOptions(Enum):
"""
Enumeration of possible values for the `video_capture` config key.
- ENABLE_VIDEO_CAPTURE (default): All video is captured.
- DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
- BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
containing the word "video" is not captured.
- DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

Note: Ensuring full video MIME type blocking requires an additional entry in the
Warcprox-Meta header `mime-type-filters` key.
"""

ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"


class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = "sites"

def populate_defaults(self):
if not "status" in self:
if "status" not in self:
self.status = "ACTIVE"
if not "claimed" in self:
if "claimed" not in self:
self.claimed = False
if not "last_disclaimed" in self:
if "last_disclaimed" not in self:
self.last_disclaimed = brozzler.EPOCH_UTC
if not "last_claimed" in self:
if "last_claimed" not in self:
self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self:
if "scope" not in self:
self.scope = {}
if not "skip_ytdlp" in self:
self.skip_ytdlp = None
if "video_capture" not in self:
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value

# backward compatibility
if "surt" in self.scope:
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
self.scope["accepts"].append({"surt": self.scope["surt"]})
del self.scope["surt"]

# backward compatibility
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
Expand All @@ -256,7 +281,7 @@ def populate_defaults(self):
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)

if not "starts_and_stops" in self:
if "starts_and_stops" not in self:
if self.get("start_time"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
Expand All @@ -271,7 +296,7 @@ def __str__(self):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)

def _accept_ssurt_if_not_redundant(self, ssurt):
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"]
Expand Down
43 changes: 34 additions & 9 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import logging
import brozzler
import brozzler.browser
from brozzler.model import VideoCaptureOptions
import threading
import time
import urllib.request
Expand Down Expand Up @@ -54,7 +55,6 @@ def __init__(
self,
frontier,
service_registry=None,
skip_av_seeds=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
Expand All @@ -74,7 +74,6 @@ def __init__(
):
self._frontier = frontier
self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._max_browsers = max_browsers

self._warcprox_auto = warcprox_auto
Expand Down Expand Up @@ -250,7 +249,17 @@ def brozzle_page(

if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
if site.video_capture in [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

with if and elif, shouldn't pdfs_only should get checked first?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should have the correct behavior either way.

The first conditional is checking that 1) there is a rule to limit video capture by MIME type AND 2) that the MIME type specified in the Content-Type header of the response indicates that it's a video. Both have to be true to enter that branch, so we aren't going to accidentally skip the PDF conditional if Content-Type is a PDF.

The second conditional is similarly constrained on the PDF-only option being set and the Content-Type header specifying that the response is a PDF.

I would not expect the order to matter, but maybe I'm missing something?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks! I did miss self._is_video_type(page_headers) initially.

I still like starting with the pdfs_only check — it seems simpler, and encompasses any limiting VideoCaptureOptions enabled.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Swapped them.

VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
] and self._is_video_type(page_headers):
self.logger.info(
"skipping video content: video MIME type capture disabled for site"
)
elif site.pdfs_only and not self._is_pdf(page_headers):
self.logger.info("skipping non-PDF content: PDFs only option enabled")
else:
self._fetch_url(site, page=page)
else:
self.logger.info("needs browsing: %s", page)
try:
Expand All @@ -262,7 +271,7 @@ def brozzle_page(
self.logger.info("page interstitial shown (http auth): %s", page)

if enable_youtube_dl and ydl.should_ytdlp(
site, page, browser.websock_thread.page_status, self._skip_av_seeds
site, page, browser.websock_thread.page_status
):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
Expand Down Expand Up @@ -303,13 +312,29 @@ def _get_page_headers(self, page):
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
return {}

def _needs_browsing(self, page_headers):
if (
def _needs_browsing(self, page_headers) -> bool:
return not bool(
"content-type" in page_headers
and "html" not in page_headers["content-type"]
):
return False
return True
)

def _is_video_type(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it contains
a video.
"""
return (
"content-type" in page_headers and "video" in page_headers["content-type"]
)

def _is_pdf(self, page_headers) -> bool:
"""
Determinse if the page's Content-Type header specifies that it is a PDF.
gretchenleighmiller marked this conversation as resolved.
Show resolved Hide resolved
"""
return (
"content-type" in page_headers
and "application/pdf" in page_headers["content-type"]
)

def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_jpeg):
Expand Down
25 changes: 7 additions & 18 deletions brozzler/ydl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import yt_dlp
from yt_dlp.utils import match_filter_func
import brozzler
from brozzler.model import VideoCaptureOptions
import urllib.request
import tempfile
import urlcanon
Expand All @@ -32,36 +33,24 @@
thread_local = threading.local()


def should_ytdlp(site, page, page_status, skip_av_seeds):
def should_ytdlp(site, page, page_status):
# called only after we've passed needs_browsing() check

if page_status != 200:
logging.info("skipping ytdlp: non-200 page status %s", page_status)
return False
if site.skip_ytdlp:
logging.info("skipping ytdlp: site marked skip_ytdlp")
if site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
]:
logging.info("skipping ytdlp: site has video capture disabled")
return False

ytdlp_url = page.redirect_url if page.redirect_url else page.url

if "chrome-error:" in ytdlp_url:
return False

ytdlp_seed = (
site["metadata"]["ait_seed_id"]
if "metadata" in site and "ait_seed_id" in site["metadata"]
else None
)

# TODO: develop UI and refactor
if ytdlp_seed:
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
logging.info("skipping ytdlp: site in skip_av_seeds")
site.skip_ytdlp = True
return False
else:
site.skip_ytdlp = False

return True


Expand Down
31 changes: 31 additions & 0 deletions job-conf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,18 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
simultaneously across the cluster. Addresses the problem of a job with many
seeds starving out other jobs.

``pdfs_only``
galgeek marked this conversation as resolved.
Show resolved Hide resolved
~~~~~~~~~~~~~~~~~~~~~
+---------+----------+-----------+
| type | required | default |
+=========+==========+===========+
| boolean | no | ``false`` |
+---------+----------+-----------+
Limits capture to PDFs based on MIME type. This value will only impact
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current code changes only processing of page-level urls with PDF-ish content-type.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you suggesting a better way of wording this in the documentation? It is called out here that this option specifically has limited impact and must be paired with a warcprox_meta rule to further filter by MIME type.

Do you think "PDF-ish" is better phrasing given the ambiguity of MIME type in Content-Type headers? I would expect MIME type to be pretty straightforward in this case given that PDFs are a pretty well-defined and well-known format.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like this would be better, I think:

"Limits captures to PDFs, based on page's content-type header.
This value, for now, affects only brozzler's capture of page-level urls.

Note: fully limiting a crawl to PDFs only will also require updates to brozzler's Warcprox-Meta header and warcprox."

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reworded slightly for clarity.

processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs
requires an additional entry in the Warcprox-Meta header ``mime-type-filters``
key to fully block videos by MIME type.
gretchenleighmiller marked this conversation as resolved.
Show resolved Hide resolved

``seeds``
~~~~~~~~~
+------------------------+----------+---------+
Expand Down Expand Up @@ -158,6 +170,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
the default values in place. Brozzler submits login forms after page load.
Then brozzling proceeds as usual.

``video_capture``
~~~~~~~~~~~~~~~~~
+--------+----------+--------------------------+
| type | required | default |
+========+==========+==========================+
| string | yes | ``ENABLE_VIDEO_CAPTURE`` |
+--------+----------+--------------------------+
Determines the level of video capture for the seed. This is an enumeration with four possible values:

* ENABLE_VIDEO_CAPTURE (default): All video is captured.
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
the word "video" is not captured.
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

*Note: Ensuring full video MIME type blocking requires an additional entry in
the Warcprox-Meta header `mime-type-filters` key.*

Seed-level / top-level settings
-------------------------------
These are seed settings that can also be specified at the top level, in which
Expand Down