Skip to content

Commit

Permalink
Revert "Remove PdfDocument._rendering_input and related features"
Browse files Browse the repository at this point in the history
This reverts commit f1f510c.

Wait with the removal until the major release.
  • Loading branch information
mara004 committed Oct 15, 2022
1 parent d92ba57 commit 9998d5a
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 25 deletions.
3 changes: 0 additions & 3 deletions docs/devel/changelog_staging.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,5 @@
<!-- List character: dash (-) -->

# Changelog for next release
- Disruption: Two components of `PdfDocument` have been removed to clean up the code (without a major release, due to their insignificance):
- Removal of `update_rendering_input()`. Callers are expected to save and re-open the document on their own if they want changes to take effect with the multi-page renderer.
- The multipage renderer does not implicitly read byte buffers into memory anymore. Callers are expected to take an explicit decision by providing a different input in the first place.
- Added a new support model `PdfImageObject` (which inherits from `PdfPageObject`). This can be used to insert a JPEG image into a page, get metadata, etc.
- Docs: The changelog page now selectively includes an entry for the next release that may be shown on `latest` builds.
1 change: 1 addition & 0 deletions docs/devel/tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Also see the issues panel and inline `TODO` marks in source code.
* Ensure we correctly handle PDFium return codes indicating failure.
* Review on a case-by-case basis where we should raise an error and where pass.
* Investigate if we can implement interruptible rendering.
* When rendering with multiple processes and bytes were provided as input, is the memory duplicated or shared? If it's duplicated, find a way to share it or write a tempfile instead.
* Move init/destroy into a separate file. Provide public init/destroy functions, given that embedders who deal with long-running applications might not want to have PDFium in memory all the time.
* Make the bindings file `_pypdfium.py` public ?

Expand Down
7 changes: 6 additions & 1 deletion docs/source/planned_changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,10 @@ The following API breaking changes are being considered for the next major relea
* The `count_chars()` alias will be removed in favour of the `n_chars` attribute.
* The `get_text()` alias will be removed in favour of `get_text_bounded()`.
- `PdfPage.insert_text()` will be renamed to `insert_text_shaped()`.
- The `PdfDocument` context manager API will be removed. It will not be possible to use documents in a `with`-block anymore.
- The `PdfDocument` class will be cleaned up:
* The context manager API will be removed. It will not be possible to use documents in a `with`-block anymore.
* The `update_rendering_input()` method will be removed.
Callers are expected to save and re-open the document on their own if they want changes to take effect with the multi-page renderer.
* The multipage renderer will not implicitly read byte buffers into memory anymore.
Callers are expected to take an explicit decision by providing a different input in the first place.
- `PdfDocument.add_font()` might be changed to take bytes rather than a file path.
35 changes: 29 additions & 6 deletions src/pypdfium2/_helpers/document.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-FileCopyrightText: 2022 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause

import io
import os
import os.path
import weakref
Expand Down Expand Up @@ -76,6 +77,7 @@ def __init__(
self._actual_input = input_data
self._data_holder = []
self._data_closer = []
self._rendering_input = None

self._password = password
self._file_access = file_access
Expand Down Expand Up @@ -499,6 +501,19 @@ def print_toc(toc, n_digits=2):
)


def update_rendering_input(self):
    """
    Update the input sources for concurrent rendering to the document's current state
    by saving to bytes and setting the result as new input.
    If you modified the document, you may want to call this method before :meth:`.render_to`.

    The serialised document is stored in ``self._rendering_input`` as a :class:`bytes` object.
    """
    # The context manager releases the in-memory buffer even if save() raises.
    with io.BytesIO() as buffer:
        self.save(buffer)
        # getvalue() returns the entire content regardless of the cursor position,
        # so there is no need to rewind the buffer before extracting the data.
        self._rendering_input = buffer.getvalue()


@classmethod
def _process_page(cls, index, converter, input_data, password, file_access, **kwargs):
pdf = cls(
Expand All @@ -519,7 +534,7 @@ def render_to(
**kwargs
):
"""
Render multiple pages in parallel, using a process pool executor.
Concurrently render multiple pages, using a process pool executor.
If rendering only a single page, the call is simply forwarded to :meth:`.PdfPage.render_to` as a shortcut.
Expand Down Expand Up @@ -552,15 +567,23 @@ def render_to(
yield result
return

if isinstance(self._orig_input, pdfium.FPDF_DOCUMENT):
raise ValueError("Cannot render in parallel without input sources.")
elif is_input_buffer(self._orig_input):
raise ValueError("Cannot render in parallel with buffer input.")
if self._rendering_input is None:
if isinstance(self._orig_input, pdfium.FPDF_DOCUMENT):
logger.warning("Cannot perform concurrent processing without input sources - saving the document implicitly to get picklable data.")
self.update_rendering_input()
elif is_input_buffer(self._orig_input):
logger.warning("Cannot perform concurrent rendering with buffer input - reading the whole buffer into memory implicitly.")
cursor = self._orig_input.tell()
self._orig_input.seek(0)
self._rendering_input = self._orig_input.read()
self._orig_input.seek(cursor)
else:
self._rendering_input = self._orig_input

invoke_renderer = functools.partial(
PdfDocument._process_page,
converter = converter,
input_data = self._orig_input,
input_data = self._rendering_input,
password = self._password,
file_access = self._file_access,
**kwargs
Expand Down
2 changes: 1 addition & 1 deletion tests/helpers/test_opener.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def test_open_new():
assert dest_pdf.raw is dest_pdf._orig_input is dest_pdf._actual_input
assert dest_pdf._data_holder == []
assert dest_pdf._data_closer == []

assert dest_pdf._rendering_input is None
assert dest_pdf.get_version() is None

src_pdf = pdfium.PdfDocument(TestFiles.multipage)
Expand Down
47 changes: 33 additions & 14 deletions tests/helpers/test_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause

import io
import re
import math
import ctypes
import weakref
import logging
from os.path import join
import numpy
import PIL.Image
Expand Down Expand Up @@ -377,32 +377,44 @@ def test_render_pdffile(render_pdffile_topil, render_pdffile_tobytes, render_pdf
assert a == b == c


def test_render_pdf_new():
def test_render_pdf_new(caplog):

# two pages to actually reach the process pool and not just the single-page shortcut
pdf = pdfium.PdfDocument.new()
# two pages to actually reach the process pool and not just the single-page shortcut
page_1 = pdf.new_page(50, 100)
page_2 = pdf.new_page(50, 100)
renderer = pdf.render_to(
pdfium.BitmapConv.pil_image,
)

with pytest.raises(ValueError, match="Cannot render in parallel without input sources."):
with caplog.at_level(logging.WARNING):
renderer = pdf.render_to(pdfium.BitmapConv.pil_image)
image = next(renderer)

warning = "Cannot perform concurrent processing without input sources - saving the document implicitly to get picklable data."
assert warning in caplog.text

assert isinstance(image, PIL.Image.Image)
assert image.mode == "RGB"
assert image.size == (50, 100)



def test_render_pdfbuffer():
def test_render_pdfbuffer(caplog):

buffer = open(TestFiles.multipage, "rb")
pdf = pdfium.PdfDocument(buffer)
assert pdf._orig_input is buffer
assert pdf._actual_input is buffer
assert pdf._rendering_input is None

renderer = pdf.render_to(
pdfium.BitmapConv.pil_image,
)
with pytest.raises(ValueError, match=re.escape("Cannot render in parallel with buffer input.")):
next(renderer)
with caplog.at_level(logging.WARNING):
renderer = pdf.render_to(
pdfium.BitmapConv.pil_image,
scale = 0.5,
)
image = next(renderer)
assert isinstance(image, PIL.Image.Image)

assert isinstance(pdf._rendering_input, bytes)
warning = "Cannot perform concurrent rendering with buffer input - reading the whole buffer into memory implicitly."
assert warning in caplog.text


def test_render_pdfbytes():
Expand All @@ -413,12 +425,14 @@ def test_render_pdfbytes():
pdf = pdfium.PdfDocument(data)
assert pdf._orig_input is data
assert pdf._actual_input is data
assert pdf._rendering_input is None
renderer = pdf.render_to(
pdfium.BitmapConv.pil_image,
scale = 0.5,
)
image = next(renderer)
assert isinstance(image, PIL.Image.Image)
assert isinstance(pdf._rendering_input, bytes)


def test_render_pdffile_asbuffer():
Expand All @@ -427,6 +441,7 @@ def test_render_pdffile_asbuffer():

assert pdf._orig_input == TestFiles.multipage
assert isinstance(pdf._actual_input, io.BufferedReader)
assert pdf._rendering_input is None
assert pdf._file_access is pdfium.FileAccess.BUFFER

renderer = pdf.render_to(
Expand All @@ -436,6 +451,8 @@ def test_render_pdffile_asbuffer():
image = next(renderer)
assert isinstance(image, PIL.Image.Image)

assert pdf._rendering_input == TestFiles.multipage

pdf.close()
assert pdf._actual_input.closed is True

Expand All @@ -446,6 +463,7 @@ def test_render_pdffile_asbytes():

assert pdf._orig_input == TestFiles.multipage
assert isinstance(pdf._actual_input, bytes)
assert pdf._rendering_input is None
assert pdf._file_access is pdfium.FileAccess.BYTES

renderer = pdf.render_to(
Expand All @@ -454,6 +472,7 @@ def test_render_pdffile_asbytes():
)
image = next(renderer)
assert isinstance(image, PIL.Image.Image)
assert pdf._rendering_input == TestFiles.multipage


@pytest.mark.parametrize(
Expand Down

0 comments on commit 9998d5a

Please sign in to comment.