ngandrass · ngandrass · Jul 18, 2024 · Jul 17, 2024 · Jul 17, 2024 · Jul 17, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## Version 1.5.0 (XXXX-XX-XX)
+
+- Optionally scale down large images within quiz reports to save space and keep PDF files compact
+- Optionally compress images within quiz reports to save space and keep PDF files compact
+- Rename `REPORT_PREVENT_REDIRECT_TO_LOGIN` to `PREVENT_REDIRECT_TO_LOGIN` to reflect the naming of the environment variable
+- Reduce noise from 3rd party library loggers on log level `DEBUG`
+
+
 ## Version 1.4.0 (2024-07-08)
 
 - Prevent belated redirects away from attempt report page (e.g. to login page)

diff --git a/archiveworker/custom_types.py b/archiveworker/custom_types.py
@@ -72,7 +72,7 @@ class JobArchiveRequest:
     Deserialized JSON request for creating an archive job
     """
 
-    API_VERSION = 5
+    API_VERSION = 6
 
     PAPER_FORMATS = ['A0', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'Letter', 'Legal', 'Tabloid', 'Ledger']
 
@@ -173,6 +173,15 @@ def _validate_self(self):
                 return False
             if not isinstance(self.tasks['archive_quiz_attempts']['filename_pattern'], str) or self.tasks['archive_quiz_attempts']['filename_pattern'] is None:
                 return False
+            image_optimize = self.tasks['archive_quiz_attempts']['image_optimize']  # must be a dict of settings or literal False
+            if image_optimize is not False and not isinstance(image_optimize, dict):
+                return False  # isinstance(x, object) is always True, so check for dict explicitly
+            if isinstance(image_optimize, dict):
+                for dimension in ('width', 'height'):
+                    if not isinstance(image_optimize[dimension], int) or image_optimize[dimension] < 1:
+                        return False
+                if not isinstance(image_optimize['quality'], int) or not 0 <= image_optimize['quality'] <= 100:
+                    return False
 
         if self.tasks['archive_moodle_backups']:
             if not isinstance(self.tasks['archive_moodle_backups'], List):

diff --git a/archiveworker/moodle_quiz_archive_worker.py b/archiveworker/moodle_quiz_archive_worker.py
@@ -187,5 +187,9 @@ def run() -> None:
     logging.basicConfig(encoding='utf-8', format='[%(asctime)s] | %(levelname)-8s | %(name)s | %(message)s', level=Config.LOG_LEVEL)
     app.logger.info(f'Running {Config.APP_NAME} version {Config.VERSION} on log level {logging.getLevelName(Config.LOG_LEVEL)}')
 
+    # Reduce noise from 3rd party library loggers
+    if Config.LOG_LEVEL == logging.DEBUG:
+        logging.getLogger("PIL").setLevel('INFO')
+
     start_processing_thread()
     waitress.serve(app, host=Config.SERVER_HOST, port=Config.SERVER_PORT)
diff --git a/archiveworker/quiz_archive_job.py b/archiveworker/quiz_archive_job.py
@@ -29,7 +29,9 @@
 from uuid import UUID
 
 import requests
+from PIL.Image import Resampling
 from playwright.async_api import async_playwright, ViewportSize, BrowserContext, Route
+from pypdf import PdfWriter
 
 from config import Config
 from .custom_types import JobStatus, JobArchiveRequest, ReportSignal, BackupStatus
@@ -203,6 +205,14 @@ async def _process_quiz_attempts(self, attemptids: List[int], paper_format: str)
                     raise InterruptedError('Thread stop requested')
                 else:
                     await self._render_quiz_attempt(context, attemptid, paper_format)
+                    if self.request.tasks['archive_quiz_attempts']['image_optimize']:
+                        await self._compress_pdf(
+                            file=Path(f"{self.workdir}/attempts/{self.archived_attempts[attemptid]}/{self.archived_attempts[attemptid]}.pdf"),
+                            pdf_compression_level=6,
+                            image_maxwidth=self.request.tasks['archive_quiz_attempts']['image_optimize']['width'],
+                            image_maxheight=self.request.tasks['archive_quiz_attempts']['image_optimize']['height'],
+                            image_quality=self.request.tasks['archive_quiz_attempts']['image_optimize']['quality']
+                        )
 
             await browser.close()
             self.logger.debug("Destroyed playwright Browser and BrowserContext")
@@ -295,7 +305,7 @@ async def javascript_redirection_patcher(route: Route):
         try:
             # Register custom route handlers
             await page.route(f"{self.request.moodle_base_url}/mock/attempt", mock_responder)
-            if Config.REPORT_PREVENT_REDIRECT_TO_LOGIN:
+            if Config.PREVENT_REDIRECT_TO_LOGIN:
                 await page.route('**/login/*.php', login_redirection_interceptor)
                 await page.route('**/*.js', javascript_redirection_patcher)
 
@@ -398,6 +408,71 @@ async def _wait_for_page_ready_signal(self, page) -> None:
             cmsg = await cmsg_handler.value
             self.logger.debug(f'Received signal: {cmsg}')
 
+    async def _compress_pdf(
+            self,
+            file: Path,
+            pdf_compression_level: int,
+            image_maxwidth: int,
+            image_maxheight: int,
+            image_quality: int
+    ) -> None:
+        """
+        Compresses a PDF file by resizing/compressing images and compressing content streams.
+        Replaces the given file in place and logs the old and new file sizes on debug level.
+
+        :param file: Path to the PDF file to compress
+        :param pdf_compression_level: Compression level for content streams (0-9)
+        :param image_maxwidth: Maximum width of images in pixels
+        :param image_maxheight: Maximum height of images in pixels
+        :param image_quality: JPEG2000 compression quality (0-100)
+        :return: None
+        """
+
+        # Dev notes:
+        # (1) Page content stream compression did not do much in our tests, but it's basically free, so we keep it
+        # without making it configurable to the user for now.
+        # (2) Re-writing the whole file after compression, as suggested by pypdf, changes nothing for us, since it
+        # is already re-written during the image processing step.
+        # (3) By far the greatest size reduction is achieved by scaling down huge images, if people upload high-res images.
+
+        old_filesize = os.path.getsize(file)
+        self.logger.debug(f"Compressing PDF file: {file} (size: {old_filesize} bytes)")
+        writer = PdfWriter(clone_from=file)
+
+        img_idx = 0
+        for page in writer.pages:
+            for img in page.images:
+                img_idx += 1
+
+                # Do not touch images with transparency data (mode=RGBA).
+                # See: https://github.com/python-pillow/Pillow/issues/8074
+                if img.image.has_transparency_data:
+                    self.logger.debug(f"  -> Skipping image {img_idx} on page {page.page_number} because it contains transparency data")
+                    continue
+
+                # Scale down large images (thumbnail() resizes in place and keeps the aspect ratio)
+                if img.image.width > image_maxwidth or img.image.height > image_maxheight:
+                    self.logger.debug(f"  -> Resizing image {img_idx} on page {page.page_number} from {img.image.width}x{img.image.height} px to fit into {image_maxwidth}x{image_maxheight} px")
+                    img.image.thumbnail(size=(image_maxwidth, image_maxheight), resample=Resampling.LANCZOS)
+
+                # Compress images
+                self.logger.debug(f"  -> Replacing image {img_idx} on page {page.page_number} with quality {image_quality}")
+                img.replace(
+                    img.image,
+                    quality=image_quality,
+                    optimize=True,
+                    progressive=False
+                )
+
+            self.logger.debug(f" -> Compressing PDF content streams on page {page.page_number} with level {pdf_compression_level}")
+            page.compress_content_streams(level=pdf_compression_level)
+
+        with open(file, "wb") as f:
+            writer.write(f)
+        new_filesize = os.path.getsize(file)  # measure only after the file is closed, so buffered data is flushed
+        size_percent = round((new_filesize / old_filesize) * 100, 2)
+        self.logger.debug(f"  -> Saved compressed PDF as: {file} (size: {new_filesize} bytes, {size_percent}% of original)")
+
     async def _process_quiz_attempts_metadata(self) -> None:
         """
         Fetches metadata for all quiz attempts that should be archived and writes it to a CSV file

diff --git a/config.py b/config.py
@@ -77,7 +77,7 @@ class Config:
     REPORT_WAIT_FOR_NAVIGATION_TIMEOUT_SEC = int(os.getenv('QUIZ_ARCHIVER_WAIT_FOR_NAVIGATION_TIMEOUT_SEC', default=30))
     """Number of seconds to wait for the report page to load before aborting the job"""
 
-    REPORT_PREVENT_REDIRECT_TO_LOGIN = bool(os.getenv('QUIZ_ARCHIVER_PREVENT_REDIRECT_TO_LOGIN', default=True))
+    PREVENT_REDIRECT_TO_LOGIN = bool(os.getenv('QUIZ_ARCHIVER_PREVENT_REDIRECT_TO_LOGIN', default=True))
     """Whether to supress all redirects to Moodle login pages (`/login/*.php`) after page load. This can occur, if dynamic ajax requests due to with permission errors."""
 
     MOODLE_WSFUNCTION_ARCHIVE = 'quiz_archiver_generate_attempt_report'