Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: PDF image optimization / compression #6

Merged
merged 14 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Changelog

## Version 1.5.0 (XXXX-XX-XX)

- Optionally scale down large images within quiz reports to preserve space and keep PDF files compact
- Optionally compress images within quiz reports to preserve space and keep PDF files compact
- Rename `REPORT_PREVENT_REDIRECT_TO_LOGIN` to `PREVENT_REDIRECT_TO_LOGIN` to reflect the naming of the environment variable
- Reduce noise from 3rd party library loggers on log level `DEBUG`


## Version 1.4.0 (2024-07-08)

- Prevent belated redirects away from attempt report page (e.g. to login page)
Expand Down
11 changes: 10 additions & 1 deletion archiveworker/custom_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class JobArchiveRequest:
Deserialized JSON request for creating an archive job
"""

API_VERSION = 5
API_VERSION = 6

PAPER_FORMATS = ['A0', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'Letter', 'Legal', 'Tabloid', 'Ledger']

Expand Down Expand Up @@ -173,6 +173,15 @@ def _validate_self(self):
return False
if not isinstance(self.tasks['archive_quiz_attempts']['filename_pattern'], str) or self.tasks['archive_quiz_attempts']['filename_pattern'] is None:
return False
if not isinstance(self.tasks['archive_quiz_attempts']['image_optimize'], object) and not self.tasks['archive_quiz_attempts']['image_optimize'] is False:
return False
if isinstance(self.tasks['archive_quiz_attempts']['image_optimize'], object) and self.tasks['archive_quiz_attempts']['image_optimize'] is not False:
if not isinstance(self.tasks['archive_quiz_attempts']['image_optimize']['width'], int) or self.tasks['archive_quiz_attempts']['image_optimize']['width'] < 1:
return False
if not isinstance(self.tasks['archive_quiz_attempts']['image_optimize']['height'], int) or self.tasks['archive_quiz_attempts']['image_optimize']['height'] < 1:
return False
if not isinstance(self.tasks['archive_quiz_attempts']['image_optimize']['quality'], int) or not 0 <= self.tasks['archive_quiz_attempts']['image_optimize']['quality'] <= 100:
return False

if self.tasks['archive_moodle_backups']:
if not isinstance(self.tasks['archive_moodle_backups'], List):
Expand Down
4 changes: 4 additions & 0 deletions archiveworker/moodle_quiz_archive_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,5 +187,9 @@ def run() -> None:
logging.basicConfig(encoding='utf-8', format='[%(asctime)s] | %(levelname)-8s | %(name)s | %(message)s', level=Config.LOG_LEVEL)
app.logger.info(f'Running {Config.APP_NAME} version {Config.VERSION} on log level {logging.getLevelName(Config.LOG_LEVEL)}')

# Reduce noise from 3rd party library loggers
if Config.LOG_LEVEL == logging.DEBUG:
logging.getLogger("PIL").setLevel('INFO')

start_processing_thread()
waitress.serve(app, host=Config.SERVER_HOST, port=Config.SERVER_PORT)
77 changes: 76 additions & 1 deletion archiveworker/quiz_archive_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
from uuid import UUID

import requests
from PIL.Image import Resampling
from playwright.async_api import async_playwright, ViewportSize, BrowserContext, Route
from pypdf import PdfWriter

from config import Config
from .custom_types import JobStatus, JobArchiveRequest, ReportSignal, BackupStatus
Expand Down Expand Up @@ -203,6 +205,14 @@ async def _process_quiz_attempts(self, attemptids: List[int], paper_format: str)
raise InterruptedError('Thread stop requested')
else:
await self._render_quiz_attempt(context, attemptid, paper_format)
if self.request.tasks['archive_quiz_attempts']['image_optimize']:
await self._compress_pdf(
file=Path(f"{self.workdir}/attempts/{self.archived_attempts[attemptid]}/{self.archived_attempts[attemptid]}.pdf"),
pdf_compression_level=6,
image_maxwidth=self.request.tasks['archive_quiz_attempts']['image_optimize']['width'],
image_maxheight=self.request.tasks['archive_quiz_attempts']['image_optimize']['height'],
image_quality=self.request.tasks['archive_quiz_attempts']['image_optimize']['quality']
)

await browser.close()
self.logger.debug("Destroyed playwright Browser and BrowserContext")
Expand Down Expand Up @@ -295,7 +305,7 @@ async def javascript_redirection_patcher(route: Route):
try:
# Register custom route handlers
await page.route(f"{self.request.moodle_base_url}/mock/attempt", mock_responder)
if Config.REPORT_PREVENT_REDIRECT_TO_LOGIN:
if Config.PREVENT_REDIRECT_TO_LOGIN:
await page.route('**/login/*.php', login_redirection_interceptor)
await page.route('**/*.js', javascript_redirection_patcher)

Expand Down Expand Up @@ -398,6 +408,71 @@ async def _wait_for_page_ready_signal(self, page) -> None:
cmsg = await cmsg_handler.value
self.logger.debug(f'Received signal: {cmsg}')

async def _compress_pdf(
    self,
    file: Path,
    pdf_compression_level: int,
    image_maxwidth: int,
    image_maxheight: int,
    image_quality: int
) -> None:
    """
    Compresses a PDF file by resizing/compressing images and compressing content streams.
    Replaces the given file.

    The compressed document is first written to a temporary sibling file and only then
    moved over the original, so a failure while writing never leaves a truncated PDF behind.

    :param file: Path to the PDF file to compress
    :param pdf_compression_level: Compression level for content streams (0-9)
    :param image_maxwidth: Maximum width of images in pixels
    :param image_maxheight: Maximum height of images in pixels
    :param image_quality: JPEG2000 compression quality (0-100)
    :return: None
    """

    # Dev notes:
    # (1) Page content stream compression did not do much in our tests, but it's basically free, so we keep it
    #     without making it configurable to the user for now.
    # (2) Re-writing the whole file after compression, as suggested by pypdf, changes nothing for us, since it
    #     is already re-written during the image processing step.
    # (3) By far the greatest size reduction is achieved by scaling down huge images, if people upload high-res
    #     images.

    old_filesize = os.path.getsize(file)
    self.logger.debug(f"Compressing PDF file: {file} (size: {old_filesize} bytes)")
    writer = PdfWriter(clone_from=file)

    # Running counter across all pages, only used to identify images in log messages
    img_idx = 0
    for page in writer.pages:
        for img in page.images:
            img_idx += 1

            # Do not touch images with transparency data (mode=RGBA).
            # See: https://github.com/python-pillow/Pillow/issues/8074
            if img.image.has_transparency_data:
                self.logger.debug(f" -> Skipping image {img_idx} on page {page.page_number} because it contains transparency data")
                continue

            # Scale down large images (thumbnail() resizes in place and keeps the aspect ratio)
            if img.image.width > image_maxwidth or img.image.height > image_maxheight:
                self.logger.debug(f" -> Resizing image {img_idx} on page {page.page_number} from {img.image.width}x{img.image.height} px to fit into {image_maxwidth}x{image_maxheight} px")
                img.image.thumbnail(size=(image_maxwidth, image_maxheight), resample=Resampling.LANCZOS)

            # Compress images
            self.logger.debug(f" -> Replacing image {img_idx} on page {page.page_number} with quality {image_quality}")
            img.replace(
                img.image,
                quality=image_quality,
                optimize=True,
                progressive=False
            )

        self.logger.debug(f" -> Compressing PDF content streams on page {page.page_number} with level {pdf_compression_level}")
        page.compress_content_streams(level=pdf_compression_level)

    # Write to a temporary sibling file and swap it in afterwards. Overwriting the original in place
    # would leave a truncated PDF behind if writing fails half-way (e.g. disk full).
    tmp_file = Path(f"{file}.tmp")
    try:
        with open(tmp_file, "wb") as f:
            writer.write(f)
        os.replace(tmp_file, file)
    finally:
        # No-op after a successful replace; removes the partial file if anything above failed
        tmp_file.unlink(missing_ok=True)

    # Measure only after the file handle is closed, so the size reflects the fully flushed file
    new_filesize = os.path.getsize(file)
    size_percent = round((new_filesize / old_filesize) * 100, 2)
    self.logger.debug(f" -> Saved compressed PDF as: {file} (size: {new_filesize} bytes, {size_percent}% of original)")

async def _process_quiz_attempts_metadata(self) -> None:
"""
Fetches metadata for all quiz attempts that should be archived and writes it to a CSV file
Expand Down
2 changes: 1 addition & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class Config:
REPORT_WAIT_FOR_NAVIGATION_TIMEOUT_SEC = int(os.getenv('QUIZ_ARCHIVER_WAIT_FOR_NAVIGATION_TIMEOUT_SEC', default=30))
"""Number of seconds to wait for the report page to load before aborting the job"""

REPORT_PREVENT_REDIRECT_TO_LOGIN = bool(os.getenv('QUIZ_ARCHIVER_PREVENT_REDIRECT_TO_LOGIN', default=True))
PREVENT_REDIRECT_TO_LOGIN = bool(os.getenv('QUIZ_ARCHIVER_PREVENT_REDIRECT_TO_LOGIN', default=True))
"""Whether to suppress all redirects to Moodle login pages (`/login/*.php`) after page load. This can occur if dynamic ajax requests fail due to permission errors."""

MOODLE_WSFUNCTION_ARCHIVE = 'quiz_archiver_generate_attempt_report'
Expand Down
Loading