Revert "Remove PdfDocument._rendering_input and related features"

This reverts commit f1f510c. Wait with the removal until the major release.
pypdfium2-team · Oct 15, 2022 · 9998d5a · 9998d5a
1 parent d92ba57
commit 9998d5a
Show file tree

Hide file tree

Showing 6 changed files with 70 additions and 25 deletions.
diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
@@ -4,8 +4,5 @@
 <!-- List character: dash (-) -->
 
 # Changelog for next release
-- Disruption: Two components of `PdfDocument` have been removed to clean up the code (without a major release, due to their insignificance):
-  - Removal of `update_rendering_input()`. Callers are expected to save and re-open the document on their if they wish that changes take effect with the multi-page renderer.
-  - The multipage renderer does not implicitly read byte buffers into memory anymore. Callers are expected to take an explicit decision by providing a different input in the first place.
 - Added a new support model `PdfImageObject` (which inherits from `PdfPageObject`). This can be used to insert a JPEG image into a page, get metadata, etc.
 - Docs: The changelog page now selectively includes an entry for the next release that may be shown on `latest` builds.
diff --git a/docs/devel/tasks.md b/docs/devel/tasks.md
@@ -17,6 +17,7 @@ Also see the issues panel and inline `TODO` marks in source code.
 * Ensure we correctly handle PDFium return codes indicating failure.
 * Review on a case-by-case basis where we should raise an error and where pass.
 * Investigate if we can implement interruptible rendering.
+* When rendering with multiple processes and bytes are provided as input, is the memory duplicated or shared? If it's duplicated, find a way to share it or write a tempfile instead.
 * Move init/destroy into a separate file. Provide public init/destroy functions, given that embedders who deal with long-running applications might not want to have PDFium in memory all the time.
 * Make the bindings file `_pypdfium.py` public ?
 

diff --git a/docs/source/planned_changes.md b/docs/source/planned_changes.md
@@ -10,5 +10,10 @@ The following API breaking changes are being considered for the next major relea
   * The `count_chars()` alias will be removed in favour of the `n_chars` attribute.
   * The `get_text()` alias will be removed in favour of `get_text_bounded()`.
 - `PdfPage.insert_text()` will be renamed to `insert_text_shaped()`.
-- The `PdfDocument` context manager API will be removed. It will not be possible to use documents in a `with`-block anymore.
+- The `PdfDocument` class will be cleaned up:
+  * The context manager API will be removed. It will not be possible to use documents in a `with`-block anymore.
+  * The `update_rendering_input()` method will be removed.
+    Callers are expected to save and re-open the document on their own if they want changes to take effect with the multi-page renderer.
+  * The multipage renderer will not implicitly read byte buffers into memory anymore.
+    Callers are expected to take an explicit decision by providing a different input in the first place.
 - `PdfDocument.add_font()` might be changed to take bytes rather than a file path.
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2022 geisserml <[email protected]>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
+import io
 import os
 import os.path
 import weakref
@@ -76,6 +77,7 @@ def __init__(
         self._actual_input = input_data
         self._data_holder = []
         self._data_closer = []
+        self._rendering_input = None
 
         self._password = password
         self._file_access = file_access
@@ -499,6 +501,19 @@ def print_toc(toc, n_digits=2):
             )
 
 
+    def update_rendering_input(self):
+        """
+        Update the input sources for concurrent rendering to the document's current state
+        by saving to bytes and setting the result as new input.
+        If you modified the document, you may want to call this method before :meth:`.render_to`.
+        """
+        buffer = io.BytesIO()
+        self.save(buffer)
+        buffer.seek(0)
+        self._rendering_input = buffer.read()
+        buffer.close()
+
+
     @classmethod
     def _process_page(cls, index, converter, input_data, password, file_access, **kwargs):
         pdf = cls(
@@ -519,7 +534,7 @@ def render_to(
             **kwargs
         ):
         """
-        Render multiple pages in parallel, using a process pool executor.
+        Concurrently render multiple pages, using a process pool executor.
         
         If rendering only a single page, the call is simply forwarded to :meth:`.PdfPage.render_to` as a shortcut.
         
@@ -552,15 +567,23 @@ def render_to(
             yield result
             return
 
-        if isinstance(self._orig_input, pdfium.FPDF_DOCUMENT):
-            raise ValueError("Cannot render in parallel without input sources.")
-        elif is_input_buffer(self._orig_input):
-            raise ValueError("Cannot render in parallel with buffer input.")
+        if self._rendering_input is None:
+            if isinstance(self._orig_input, pdfium.FPDF_DOCUMENT):
+                logger.warning("Cannot perform concurrent processing without input sources - saving the document implicitly to get picklable data.")
+                self.update_rendering_input()
+            elif is_input_buffer(self._orig_input):
+                logger.warning("Cannot perform concurrent rendering with buffer input - reading the whole buffer into memory implicitly.")
+                cursor = self._orig_input.tell()
+                self._orig_input.seek(0)
+                self._rendering_input = self._orig_input.read()
+                self._orig_input.seek(cursor)
+            else:
+                self._rendering_input = self._orig_input
 
         invoke_renderer = functools.partial(
             PdfDocument._process_page,
             converter = converter,
-            input_data = self._orig_input,
+            input_data = self._rendering_input,
             password = self._password,
             file_access = self._file_access,
             **kwargs

diff --git a/tests/helpers/test_opener.py b/tests/helpers/test_opener.py
@@ -181,7 +181,7 @@ def test_open_new():
     assert dest_pdf.raw is dest_pdf._orig_input is dest_pdf._actual_input
     assert dest_pdf._data_holder == []
     assert dest_pdf._data_closer == []
-
+    assert dest_pdf._rendering_input is None
     assert dest_pdf.get_version() is None
 
     src_pdf = pdfium.PdfDocument(TestFiles.multipage)

diff --git a/tests/helpers/test_renderer.py b/tests/helpers/test_renderer.py
@@ -2,10 +2,10 @@
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 import io
-import re
 import math
 import ctypes
 import weakref
+import logging
 from os.path import join
 import numpy
 import PIL.Image
@@ -377,32 +377,44 @@ def test_render_pdffile(render_pdffile_topil, render_pdffile_tobytes, render_pdf
         assert a == b == c
 
 
-def test_render_pdf_new():
+def test_render_pdf_new(caplog):
 
-    # two pages to actually reach the process pool and not just the single-page shortcut
     pdf = pdfium.PdfDocument.new()
+    # two pages to actually reach the process pool and not just the single-page shortcut
     page_1 = pdf.new_page(50, 100)
     page_2 = pdf.new_page(50, 100)
-    renderer = pdf.render_to(
-        pdfium.BitmapConv.pil_image,
-    )
 
-    with pytest.raises(ValueError, match="Cannot render in parallel without input sources."):
+    with caplog.at_level(logging.WARNING):
+        renderer = pdf.render_to(pdfium.BitmapConv.pil_image)
         image = next(renderer)
+
+    warning = "Cannot perform concurrent processing without input sources - saving the document implicitly to get picklable data."
+    assert warning in caplog.text
+
+    assert isinstance(image, PIL.Image.Image)
+    assert image.mode == "RGB"
+    assert image.size == (50, 100)
+
 
-
-def test_render_pdfbuffer():
+def test_render_pdfbuffer(caplog):
 
     buffer = open(TestFiles.multipage, "rb")
     pdf = pdfium.PdfDocument(buffer)
     assert pdf._orig_input is buffer
     assert pdf._actual_input is buffer
+    assert pdf._rendering_input is None
 
-    renderer = pdf.render_to(
-        pdfium.BitmapConv.pil_image,
-    )
-    with pytest.raises(ValueError, match=re.escape("Cannot render in parallel with buffer input.")):
-        next(renderer)
+    with caplog.at_level(logging.WARNING):
+        renderer = pdf.render_to(
+            pdfium.BitmapConv.pil_image,
+            scale = 0.5,
+        )
+        image = next(renderer)
+        assert isinstance(image, PIL.Image.Image)
+
+    assert isinstance(pdf._rendering_input, bytes)
+    warning = "Cannot perform concurrent rendering with buffer input - reading the whole buffer into memory implicitly."
+    assert warning in caplog.text
 
 
 def test_render_pdfbytes():
@@ -413,12 +425,14 @@ def test_render_pdfbytes():
     pdf = pdfium.PdfDocument(data)
     assert pdf._orig_input is data
     assert pdf._actual_input is data
+    assert pdf._rendering_input is None
     renderer = pdf.render_to(
         pdfium.BitmapConv.pil_image,
         scale = 0.5,
     )
     image = next(renderer)
     assert isinstance(image, PIL.Image.Image)
+    assert isinstance(pdf._rendering_input, bytes)
 
 
 def test_render_pdffile_asbuffer():
@@ -427,6 +441,7 @@ def test_render_pdffile_asbuffer():
 
     assert pdf._orig_input == TestFiles.multipage
     assert isinstance(pdf._actual_input, io.BufferedReader)
+    assert pdf._rendering_input is None
     assert pdf._file_access is pdfium.FileAccess.BUFFER
 
     renderer = pdf.render_to(
@@ -436,6 +451,8 @@ def test_render_pdffile_asbuffer():
     image = next(renderer)
     assert isinstance(image, PIL.Image.Image)
 
+    assert pdf._rendering_input == TestFiles.multipage
+
     pdf.close()
     assert pdf._actual_input.closed is True
 
@@ -446,6 +463,7 @@ def test_render_pdffile_asbytes():
 
     assert pdf._orig_input == TestFiles.multipage
     assert isinstance(pdf._actual_input, bytes)
+    assert pdf._rendering_input is None
     assert pdf._file_access is pdfium.FileAccess.BYTES
 
     renderer = pdf.render_to(
@@ -454,6 +472,7 @@ def test_render_pdffile_asbytes():
     )
     image = next(renderer)
     assert isinstance(image, PIL.Image.Image)
+    assert pdf._rendering_input == TestFiles.multipage
 
 
 @pytest.mark.parametrize(