From 9856cfce107dced253ba00ad252328cf6361908d Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 15:48:12 +0200 Subject: [PATCH 001/140] Work on API-breaking changes (bookmarks) This backports (and slightly improves) the new bookmark API from devel_new. Test suite TBD. --- src/pypdfium2/_cli/toc.py | 31 ++--- src/pypdfium2/_helpers/document.py | 174 +++++++++++++++-------------- 2 files changed, 111 insertions(+), 94 deletions(-) diff --git a/src/pypdfium2/_cli/toc.py b/src/pypdfium2/_cli/toc.py index 6921c2af8..f05f50d6c 100644 --- a/src/pypdfium2/_cli/toc.py +++ b/src/pypdfium2/_cli/toc.py @@ -25,18 +25,23 @@ def attach(parser): def main(args): pdf = get_input(args) - toc = pdf.get_toc( - max_depth = args.max_depth, - ) + toc = pdf.get_toc(max_depth=args.max_depth) - for item in toc: - state = "*" if item.n_kids == 0 else "-" if item.is_closed else "+" - target = "?" if item.page_index is None else item.page_index+1 - print( - " " * item.level + - "[%s] %s -> %s # %s %s" % ( - state, item.title, target, - pdfium_i.ViewmodeToStr.get(item.view_mode), - round_list(item.view_pos, args.n_digits), - ) + for bm in toc: + count, dest = bm.get_count(), bm.get_dest() + out = " " * bm.level + out += "[%s] %s -> " % ( + "*" if count == 0 else f"{count:+}", + bm.get_title(), ) + # distinguish between "no dest" and "dest with invalid values" while keeping result machine readable + if dest: + index, (view_mode, view_pos) = dest.get_index(), dest.get_view() + out += "%s # %s %s" % ( + index+1 if index != None else "?", + pdfium_i.ViewmodeToStr.get(view_mode), + round_list(view_pos, args.n_digits), + ) + else: + out += "_" + print(out) diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index b12296942..629e4ad45 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -__all__ = ("PdfDocument", 
"PdfFormEnv", "PdfXObject", "PdfOutlineItem") +__all__ = ("PdfDocument", "PdfFormEnv", "PdfXObject", "PdfBookmark", "PdfDest") import os import ctypes @@ -183,7 +183,6 @@ def init_forms(self, config=None): ) - # TODO?(v5) consider cached property def get_formtype(self): """ Returns: @@ -193,7 +192,6 @@ def get_formtype(self): return pdfium_c.FPDF_GetFormType(self) - # TODO?(v5) consider cached property def get_pagemode(self): """ Returns: @@ -202,7 +200,6 @@ def get_pagemode(self): return pdfium_c.FPDFDoc_GetPageMode(self) - # TODO?(v5) consider cached property def is_tagged(self): """ Returns: @@ -355,7 +352,6 @@ def del_attachment(self, index): raise PdfiumError(f"Failed to delete attachment at index {index}.") - # TODO deprecate in favour of index access? def get_page(self, index): """ Returns: @@ -398,7 +394,7 @@ def new_page(self, width, height, index=None): index = len(self) raw_page = pdfium_c.FPDFPage_New(self, index, width, height) page = PdfPage(raw_page, self, None) - # not doing formenv calls for new pages as we don't see the point + # not doing formenv calls for new pages self._add_kid(page) return page @@ -406,8 +402,9 @@ def new_page(self, width, height, index=None): def del_page(self, index): """ Remove the page at *index* (zero-based). + It is recommended to close any open handles to the page before deleting it. """ - # FIXME what if the caller still has a handle to the page? + # FIXME not sure how pdfium would behave if the caller tries to access a handle to a deleted page... 
pdfium_c.FPDFPage_Delete(self, index) @@ -486,42 +483,6 @@ def page_as_xobject(self, index, dest_pdf): return xobject - # TODO(apibreak) consider switching to a wrapper class around the raw bookmark - # (either with getter methods, or possibly cached properties) - def _get_bookmark(self, bookmark, level): - - n_bytes = pdfium_c.FPDFBookmark_GetTitle(bookmark, None, 0) - buffer = ctypes.create_string_buffer(n_bytes) - pdfium_c.FPDFBookmark_GetTitle(bookmark, buffer, n_bytes) - title = buffer.raw[:n_bytes-2].decode('utf-16-le') - - # TODO(apibreak) just expose count as-is rather than using two variables and doing extra work - count = pdfium_c.FPDFBookmark_GetCount(bookmark) - is_closed = True if count < 0 else None if count == 0 else False - n_kids = abs(count) - - dest = pdfium_c.FPDFBookmark_GetDest(self, bookmark) - page_index = pdfium_c.FPDFDest_GetDestPageIndex(self, dest) - if page_index == -1: - page_index = None - - n_params = ctypes.c_ulong() - view_pos = (pdfium_c.FS_FLOAT * 4)() - view_mode = pdfium_c.FPDFDest_GetView(dest, n_params, view_pos) - view_pos = list(view_pos)[:n_params.value] - - return PdfOutlineItem( - level = level, - title = title, - is_closed = is_closed, - n_kids = n_kids, - page_index = page_index, - view_mode = view_mode, - view_pos = view_pos, - ) - - - # TODO(apibreak) change outline API (see above) def get_toc( self, max_depth = 15, @@ -530,39 +491,37 @@ def get_toc( seen = None, ): """ - Iterate through the bookmarks in the document's table of contents. + Iterate through the bookmarks in the document's table of contents (TOC). Parameters: max_depth (int): Maximum recursion depth to consider. Yields: - :class:`.PdfOutlineItem`: Bookmark information. 
+ :class:`.PdfBookmark` """ if seen is None: seen = set() - bookmark = pdfium_c.FPDFBookmark_GetFirstChild(self, parent) + bm_ptr = pdfium_c.FPDFBookmark_GetFirstChild(self, parent) - while bookmark: + # NOTE We need bool(ptr) here to handle cases where .contents is a null pointer (raises exception on access). Don't use ptr != None, it's always true. + while bm_ptr: - address = ctypes.addressof(bookmark.contents) + address = ctypes.addressof(bm_ptr.contents) if address in seen: - logger.warning("A circular bookmark reference was detected whilst parsing the table of contents.") + logger.warning("A circular bookmark reference was detected while traversing the table of contents.") break else: seen.add(address) - yield self._get_bookmark(bookmark, level) + yield PdfBookmark(bm_ptr, self, level) if level < max_depth-1: - yield from self.get_toc( - max_depth = max_depth, - parent = bookmark, - level = level + 1, - seen = seen, - ) + yield from self.get_toc(max_depth=max_depth, parent=bm_ptr, level=level+1, seen=seen) + elif pdfium_c.FPDFBookmark_GetFirstChild(self, bm_ptr): + logger.warning(f"Maximum recursion depth {max_depth} reached. Children beyond this scope are ignored.") - bookmark = pdfium_c.FPDFBookmark_GetNextSibling(self, bookmark) + bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr) def render( @@ -681,28 +640,81 @@ def _open_pdf(input_data, password, autoclose): return pdf, to_hold, to_close -# TODO(apibreak) change outline API (see above) -PdfOutlineItem = namedtuple("PdfOutlineItem", "level title is_closed n_kids page_index view_mode view_pos") -""" -Bookmark information. +class PdfBookmark (pdfium_i.AutoCastable): + """ + Bookmark helper class. + + Attributes: + raw (FPDF_BOOKMARK): + The underlying PDFium bookmark handle. + pdf (PdfDocument): + Reference to the document this bookmark belongs to. + level (int): + The bookmark's nesting level in the TOC tree. Corresponds to the number of parent bookmarks. 
+ """ + + def __init__(self, raw, pdf, level): + self.raw, self.pdf, self.level = raw, pdf, level + + def get_title(self): + """ + Returns: + str: The bookmark's title string. + """ + n_bytes = pdfium_c.FPDFBookmark_GetTitle(self, None, 0) + buffer = ctypes.create_string_buffer(n_bytes) + pdfium_c.FPDFBookmark_GetTitle(self, buffer, n_bytes) + return buffer.raw[:n_bytes-2].decode("utf-16-le") + + def get_count(self): + """ + Returns: + int: Signed number of child bookmarks (fully recursive). Zero if the bookmark has no descendants. + The initial state shall be closed (collapsed) if negative, open (expanded) if positive. + """ + return pdfium_c.FPDFBookmark_GetCount(self) + + def get_dest(self): + """ + Returns: + PdfDest | None: The bookmark's destination (page index, viewport), or None on failure. + """ + raw_dest = pdfium_c.FPDFBookmark_GetDest(self.pdf, self) + if not raw_dest: + return None + return PdfDest(raw_dest, pdf=self.pdf) + -Parameters: - level (int): - Number of parent items. - title (str): - Title string of the bookmark. - is_closed (bool): - True if child items shall be collapsed, False if they shall be expanded. - None if the item has no descendants (i. e. ``n_kids == 0``). - n_kids (int): - Absolute number of child items, according to the PDF. - page_index (int | None): - Zero-based index of the page the bookmark points to. - May be None if the bookmark has no target page (or it could not be determined). - view_mode (int): - A view mode constant (:data:`PDFDEST_VIEW_*`) defining how the coordinates of *view_pos* shall be interpreted. - view_pos (list[float]): - Target position on the page the viewport should jump to when the bookmark is clicked. - It is a sequence of :class:`float` values in PDF canvas units. - Depending on *view_mode*, it may contain between 0 and 4 coordinates. -""" +class PdfDest (pdfium_i.AutoCastable): + """ + Destination helper class. + + Attributes: + raw (FPDF_DEST): The underlying PDFium destination handle. 
+ pdf (PdfDocument): Reference to the document this dest belongs to. + """ + + def __init__(self, raw, pdf): + self.raw, self.pdf = raw, pdf + + def get_index(self): + """ + Returns: + int | None: Zero-based index of the page the dest points to, or None on failure. + """ + val = pdfium_c.FPDFDest_GetDestPageIndex(self.pdf, self) + return val if val >= 0 else None + + def get_view(self): + """ + Returns: + (int, list[float]): A tuple of (view_mode, view_pos). + *view_mode* is a constant (one of :data:`PDFDEST_VIEW_*`) defining how *view_pos* shall be interpreted. + *view_pos* is the target position on the page the dest points to. + It may contain between 0 to 4 float coordinates, depending on the view mode. + """ + n_params = ctypes.c_ulong() + pos = (pdfium_c.FS_FLOAT * 4)() + mode = pdfium_c.FPDFDest_GetView(self, n_params, pos) + pos = list(pos)[:n_params.value] + return (mode, pos) From 0183d80a4cdc03356dae34c17ab8834d3c227d0c Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 17:21:44 +0200 Subject: [PATCH 002/140] toc: update API test --- tests/test_toc.py | 48 ++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/tests/test_toc.py b/tests/test_toc.py index a1b54a1db..b8e0869a5 100644 --- a/tests/test_toc.py +++ b/tests/test_toc.py @@ -8,10 +8,19 @@ from .conftest import TestResources -def _compare_bookmark(bookmark, view_pos, **kwargs): - for name, exp_value in kwargs.items(): - assert exp_value == getattr(bookmark, name) - assert pytest.approx(bookmark.view_pos, abs=1) == view_pos +def _compare_bookmark(bm, **kwargs): + assert isinstance(bm, pdfium.PdfBookmark) + assert kwargs["title"] == bm.get_title() + assert kwargs["count"] == bm.get_count() + dest = bm.get_dest() + if dest is None: + assert kwargs["dest"] is None + else: + assert isinstance(dest, pdfium.PdfDest) + assert kwargs["page_index"] == dest.get_index() + view_mode, view_pos = dest.get_view() + assert kwargs["view_mode"] == 
view_mode + assert kwargs["view_pos"] == pytest.approx(view_pos, abs=1) def test_gettoc(): @@ -26,25 +35,24 @@ def test_gettoc(): page_index = 0, view_mode = pdfium_c.PDFDEST_VIEW_XYZ, view_pos = (89, 758, 0), - is_closed = True, - n_kids = 2, + count = -2, ) # check common values - for bookmark in toc: - assert isinstance(bookmark, pdfium.PdfOutlineItem) - assert bookmark.view_mode is pdfium_c.PDFDEST_VIEW_XYZ - assert round(bookmark.view_pos[0]) == 89 + for bm in toc: + dest = bm.get_dest() + view_mode, view_pos = dest.get_view() + assert view_mode is pdfium_c.PDFDEST_VIEW_XYZ + assert round(view_pos[0]) == 89 # check last bookmark _compare_bookmark( - bookmark, + bm, title = "Three-B", page_index = 1, view_mode = pdfium_c.PDFDEST_VIEW_XYZ, view_pos = (89, 657, 0), - is_closed = None, - n_kids = 0, + count = 0, ) @@ -56,20 +64,14 @@ def test_gettoc_circular(caplog): _compare_bookmark( next(toc), title = "A Good Beginning", - page_index = None, - view_mode = pdfium_c.PDFDEST_VIEW_UNKNOWN_MODE, - view_pos = [], - is_closed = None, - n_kids = 0, + dest = None, + count = 0, ) _compare_bookmark( next(toc), title = "A Good Ending", - page_index = None, - view_mode = pdfium_c.PDFDEST_VIEW_UNKNOWN_MODE, - view_pos = [], - is_closed = None, - n_kids = 0, + dest = None, + count = 0, ) with caplog.at_level(logging.WARNING): for other in toc: pass From d699a6ced30fdc9e866f7e9613a81882e507da4e Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 17:40:33 +0200 Subject: [PATCH 003/140] test_cli: also capture stderr/logging Note the following test script: ``` import io import sys import logging import contextlib logger = logging.getLogger("testLogger") logger.setLevel(logging.DEBUG) buf = io.StringIO() logger.addHandler(logging.StreamHandler(buf)) # ! 
with contextlib.redirect_stdout(buf), contextlib.redirect_stderr(buf): print("print to stdout") print("print to stderr", file=sys.stderr) logger.info("info message") logger.warning("warning message") print(f"{buf.getvalue()!r}") ``` Like this, we get: > 'print to stdout\nprint to stderr\ninfo message\nwarning message\n' Without handler: > 'print to stdout\nprint to stderr\nwarning message\n' With default handler: > info message > warning message > 'print to stdout\nprint to stderr\n' Weird. --- tests/test_cli.py | 50 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 2b64dfd93..4c05bf5d1 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause import io +import logging import filecmp import contextlib from pathlib import Path @@ -11,28 +12,57 @@ import pypdfium2.__main__ as pdfium_cli from .conftest import TestResources, TestExpectations +lib_logger = logging.getLogger("pypdfium2") -def run_cli(argv, exp_stdout=None, normalize_lfs=False): +@contextlib.contextmanager +def logging_capture_handler(buf): + orig_handlers = lib_logger.handlers + lib_logger.handlers = [] + handler = logging.StreamHandler(buf) + lib_logger.addHandler(handler) + yield + lib_logger.removeHandler(handler) + lib_logger.handlers = orig_handlers + + +@contextlib.contextmanager +def joined_ctx(ctxes): + with contextlib.ExitStack() as stack: + for ctx in ctxes: stack.enter_context(ctx) + yield + + +def run_cli(argv, exp_output=None, capture=("out", "err", "log"), normalize_lfs=False): argv = [str(a) for a in argv] - if exp_stdout is None: + if exp_output is None: pdfium_cli.api_main(argv) else: - stdout_buf = io.StringIO() - with contextlib.redirect_stdout(stdout_buf): + output = io.StringIO() + ctxes = [] + assert isinstance(capture, (tuple, list)) + if "out" in capture: + ctxes += [contextlib.redirect_stdout(output)] + if 
"err" in capture: + ctxes += [contextlib.redirect_stderr(output)] + # for some reason, logging doesn't seem to go the usual stdout/stderr path, so explicitly install a stream handler to capture + if "log" in capture: + ctxes += [logging_capture_handler(output)] + assert len(ctxes) >= 1 + with joined_ctx(ctxes): pdfium_cli.api_main(argv) - if isinstance(exp_stdout, Path): - exp_stdout = exp_stdout.read_text() + if isinstance(exp_output, Path): + exp_output = exp_output.read_text() - stdout = stdout_buf.getvalue() + output = output.getvalue() if normalize_lfs: - stdout = stdout.replace("\r\n", "\n") + output = output.replace("\r\n", "\n") - assert stdout == exp_stdout + assert output == exp_output def _get_files(dir): @@ -57,7 +87,7 @@ def test_attachments(tmp_path): edited_pdf = tmp_path / "edited.pdf" run_cli(["attachments", TestResources.attachments, "edit", "--del-numbers", "1,2", "--add-files", TestResources.mona_lisa, "-o", edited_pdf]) - run_cli(["attachments", edited_pdf, "list"], "[1] mona_lisa.jpg\n") + run_cli(["attachments", edited_pdf, "list"], "[1] mona_lisa.jpg\n", capture=["out"]) def test_images(tmp_path): From 8d0d36fcd20e491d40835d40a631a714c8b59ff7 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 17:40:59 +0200 Subject: [PATCH 004/140] Update test expectations --- tests/expectations/attachments_list.txt | 1 + tests/expectations/pdfinfo_attachments.txt | 1 + tests/expectations/toc.txt | 6 ++-- tests/expectations/toc_circular.txt | 5 +-- tests/expectations/toc_maxdepth.txt | 37 +++++++++++----------- 5 files changed, 27 insertions(+), 23 deletions(-) diff --git a/tests/expectations/attachments_list.txt b/tests/expectations/attachments_list.txt index 539fe1d54..92641d8d9 100644 --- a/tests/expectations/attachments_list.txt +++ b/tests/expectations/attachments_list.txt @@ -1,2 +1,3 @@ +Unsupported PDF feature: Attachment (incomplete support) [1] 1.txt [2] attached.pdf diff --git a/tests/expectations/pdfinfo_attachments.txt 
b/tests/expectations/pdfinfo_attachments.txt index 898f35b9c..1bfbeb373 100644 --- a/tests/expectations/pdfinfo_attachments.txt +++ b/tests/expectations/pdfinfo_attachments.txt @@ -1,3 +1,4 @@ +Unsupported PDF feature: Attachment (incomplete support) Page Count: 1 PDF Version: 1.6 ID (permanent): b'\xd8\x89\xebk\x9a\xdf\x88\xe5\xed\xa7\xdc\x08\xfe\x85\x97' diff --git a/tests/expectations/toc.txt b/tests/expectations/toc.txt index b635d42a4..bd6aa6b50 100644 --- a/tests/expectations/toc.txt +++ b/tests/expectations/toc.txt @@ -1,9 +1,9 @@ -[-] One -> 1 # XYZ [89.29, 757.7, 0.0] +[-2] One -> 1 # XYZ [89.29, 757.7, 0.0] [*] One-A -> 1 # XYZ [89.29, 706.86, 0.0] - [-] One-B -> 1 # XYZ [89.29, 657.03, 0.0] + [-2] One-B -> 1 # XYZ [89.29, 657.03, 0.0] [*] One-B-I -> 1 # XYZ [89.29, 607.2, 0.0] [*] One-B-II -> 1 # XYZ [89.29, 557.76, 0.0] [*] Two -> 1 # XYZ [89.29, 507.16, 0.0] -[-] Three -> 2 # XYZ [89.29, 757.7, 0.0] +[-2] Three -> 2 # XYZ [89.29, 757.7, 0.0] [*] Three-A -> 2 # XYZ [89.29, 706.98, 0.0] [*] Three-B -> 2 # XYZ [89.29, 657.15, 0.0] diff --git a/tests/expectations/toc_circular.txt b/tests/expectations/toc_circular.txt index 15142248c..984920d7f 100644 --- a/tests/expectations/toc_circular.txt +++ b/tests/expectations/toc_circular.txt @@ -1,2 +1,3 @@ -[*] A Good Beginning -> ? # ? [] -[*] A Good Ending -> ? # ? [] +[*] A Good Beginning -> _ +[*] A Good Ending -> _ +A circular bookmark reference was detected while traversing the table of contents. 
diff --git a/tests/expectations/toc_maxdepth.txt b/tests/expectations/toc_maxdepth.txt index 814bfceea..47a04509a 100644 --- a/tests/expectations/toc_maxdepth.txt +++ b/tests/expectations/toc_maxdepth.txt @@ -1,20 +1,21 @@ -[+] 1.outline -> 1 # FitH [746.439] - [+] 1.1.outline -> 1 # FitH [700.878] - [+] 1.1.1.outline -> 1 # FitH [632.537] - [+] 1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.outline -> 1 # FitH [597.304] - [+] 1.1.1.1.1.1outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] -[+] 2.outline -> 2 # FitH [749.4771] - [+] 2.1.outline -> 2 # FitH [699.36] - [+] 2.1.1.outline -> 2 # FitH [628.74] +[+100] 1.outline -> 1 # FitH [746.439] + [+100] 1.1.outline -> 1 # FitH [700.878] + [+1] 1.1.1.outline -> 1 # FitH [632.537] + [+1] 1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.outline -> 1 # FitH [597.304] + [+1] 1.1.1.1.1.1outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] +Maximum recursion depth 15 reached. Children beyond this scope are ignored. 
+[+100] 2.outline -> 2 # FitH [749.4771] + [+100] 2.1.outline -> 2 # FitH [699.36] + [+100] 2.1.1.outline -> 2 # FitH [628.74] [*] 2.1.1.1.outline -> 2 # FitH [583.179] [*] 2.2 outline -> 2 # FitH [515.218] From 847281ceac30e10f1f7c8bf7bf8dfc92b6c05f83 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 17:59:00 +0200 Subject: [PATCH 005/140] toc: better explain level == maxdepth scenario --- src/pypdfium2/_helpers/document.py | 3 ++- tests/expectations/toc_maxdepth.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 629e4ad45..80fad8cbb 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -519,7 +519,8 @@ def get_toc( if level < max_depth-1: yield from self.get_toc(max_depth=max_depth, parent=bm_ptr, level=level+1, seen=seen) elif pdfium_c.FPDFBookmark_GetFirstChild(self, bm_ptr): - logger.warning(f"Maximum recursion depth {max_depth} reached. Children beyond this scope are ignored.") + # Warn only if there actually is a subtree. If level == max_depth but the tree ends there, it's fine as no information is skipped. + logger.warning(f"Maximum recursion depth {max_depth} reached (subtree skipped).") bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr) diff --git a/tests/expectations/toc_maxdepth.txt b/tests/expectations/toc_maxdepth.txt index 47a04509a..711731985 100644 --- a/tests/expectations/toc_maxdepth.txt +++ b/tests/expectations/toc_maxdepth.txt @@ -13,7 +13,7 @@ [+1] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] -Maximum recursion depth 15 reached. Children beyond this scope are ignored. +Maximum recursion depth 15 reached (subtree skipped). 
[+100] 2.outline -> 2 # FitH [749.4771] [+100] 2.1.outline -> 2 # FitH [699.36] [+100] 2.1.1.outline -> 2 # FitH [628.74] From f2352263a2a1a39c8e10b3c18b57c3f181e7963a Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 18:06:45 +0200 Subject: [PATCH 006/140] Start tracking changes --- docs/devel/changelog_staging.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index e41edd85a..9abd31065 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -4,3 +4,9 @@ # Changelog for next release +- PdfDocument.get_toc(): Replaced bookmark namedtuple `PdfOutlineItem` with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Also provide signed count as-is rather than needlessly splitting it in two variables (unsigned int `n_kids` and bool `is_closed`). + + From 4bfb46144195ba1a54921b59de92365e8fb88f9d Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 18:10:43 +0200 Subject: [PATCH 007/140] slightly improve docs for get_count() --- src/pypdfium2/_helpers/document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 80fad8cbb..2f9882b61 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -670,7 +670,7 @@ def get_title(self): def get_count(self): """ Returns: - int: Signed number of child bookmarks (fully recursive). Zero if the bookmark has no descendants. + int: Signed number of child bookmarks, recursively counting all members in the subtree. Zero if the bookmark has no descendants. The initial state shall be closed (collapsed) if negative, open (expanded) if positive. 
""" return pdfium_c.FPDFBookmark_GetCount(self) From ac7903f3d6793878da8eca1db45307e66fdcc43b Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 21:31:07 +0200 Subject: [PATCH 008/140] address various nits --- docs/devel/changelog_staging.md | 4 ++-- src/pypdfium2/_cli/toc.py | 4 ++-- src/pypdfium2/_helpers/document.py | 8 ++++---- tests/test_toc.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 9abd31065..2e1e10693 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -4,9 +4,9 @@ # Changelog for next release -- PdfDocument.get_toc(): Replaced bookmark namedtuple `PdfOutlineItem` with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Also provide signed count as-is rather than needlessly splitting it in two variables (unsigned int `n_kids` and bool `is_closed`). +- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Also provide signed count as-is rather than needlessly splitting in two variables (unsigned int `n_kids` and bool `is_closed`). 
diff --git a/src/pypdfium2/_cli/toc.py b/src/pypdfium2/_cli/toc.py index f05f50d6c..c19fe8749 100644 --- a/src/pypdfium2/_cli/toc.py +++ b/src/pypdfium2/_cli/toc.py @@ -31,10 +31,10 @@ def main(args): count, dest = bm.get_count(), bm.get_dest() out = " " * bm.level out += "[%s] %s -> " % ( - "*" if count == 0 else f"{count:+}", + f"{count:+}" if count != 0 else "*", bm.get_title(), ) - # distinguish between "no dest" and "dest with invalid values" while keeping result machine readable + # distinguish between "no dest" and "dest with unknown mode" while keeping result machine readable if dest: index, (view_mode, view_pos) = dest.get_index(), dest.get_view() out += "%s # %s %s" % ( diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 2f9882b61..1ba968e9a 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -402,7 +402,7 @@ def new_page(self, width, height, index=None): def del_page(self, index): """ Remove the page at *index* (zero-based). - It is recommended to close any open handles to the page before deleting it. + It is recommended to close any open handles to the page before calling this method. """ # FIXME not sure how pdfium would behave if the caller tries to access a handle to a deleted page... pdfium_c.FPDFPage_Delete(self, index) @@ -519,7 +519,7 @@ def get_toc( if level < max_depth-1: yield from self.get_toc(max_depth=max_depth, parent=bm_ptr, level=level+1, seen=seen) elif pdfium_c.FPDFBookmark_GetFirstChild(self, bm_ptr): - # Warn only if there actually is a subtree. If level == max_depth but the tree ends there, it's fine as no information is skipped. + # Warn only if there actually is a subtree. If level == max_depth but the tree ends there, it's fine as no info is skipped. 
logger.warning(f"Maximum recursion depth {max_depth} reached (subtree skipped).") bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr) @@ -606,7 +606,7 @@ def as_pageobject(self): Returns: PdfObject: An independent page object representation of the XObject. If multiple page objects are created from one XObject, they share resources. - Page objects created from an XObject remain valid after the XObject is closed. + Pageobjects created from an XObject remain valid after the XObject is closed. """ raw_pageobj = pdfium_c.FPDF_NewFormObjectFromXObject(self) return PdfObject( # not a child object (see above) @@ -651,7 +651,7 @@ class PdfBookmark (pdfium_i.AutoCastable): pdf (PdfDocument): Reference to the document this bookmark belongs to. level (int): - The bookmark's nesting level in the TOC tree. Corresponds to the number of parent bookmarks. + The bookmark's nesting level in the TOC tree (zero-based). Corresponds to the number of parent bookmarks. """ def __init__(self, raw, pdf, level): diff --git a/tests/test_toc.py b/tests/test_toc.py index b8e0869a5..a732822a3 100644 --- a/tests/test_toc.py +++ b/tests/test_toc.py @@ -42,7 +42,7 @@ def test_gettoc(): for bm in toc: dest = bm.get_dest() view_mode, view_pos = dest.get_view() - assert view_mode is pdfium_c.PDFDEST_VIEW_XYZ + assert view_mode == pdfium_c.PDFDEST_VIEW_XYZ assert round(view_pos[0]) == 89 # check last bookmark From 517630a9dc222eae144d85a468795beeff3db4dd Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 22:20:09 +0200 Subject: [PATCH 009/140] Continue on document and bitmap Removed PdfDocument.render() & PdfBitmapInfo. Implemented context manager support for PdfDocument. Test suite integration TBD. 
--- docs/devel/changelog_staging.md | 11 ++- src/pypdfium2/_cli/toc.py | 2 +- src/pypdfium2/_helpers/bitmap.py | 113 +++++++++++------------------ src/pypdfium2/_helpers/document.py | 47 ++++-------- 4 files changed, 64 insertions(+), 109 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 2e1e10693..03efeb76b 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -4,9 +4,16 @@ # Changelog for next release -- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Also provide signed count as-is rather than needlessly splitting in two variables (unsigned int `n_kids` and bool `is_closed`). + +*API-breaking changes* +- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also, distinguish between `dest == None` and an empty dest. +- Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). +- Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`. + +*Improvements and new features* +- Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. 
diff --git a/src/pypdfium2/_cli/toc.py b/src/pypdfium2/_cli/toc.py index c19fe8749..5425a33ea 100644 --- a/src/pypdfium2/_cli/toc.py +++ b/src/pypdfium2/_cli/toc.py @@ -34,7 +34,7 @@ def main(args): f"{count:+}" if count != 0 else "*", bm.get_title(), ) - # distinguish between "no dest" and "dest with unknown mode" while keeping result machine readable + # distinguish between "dest == None" and "dest with unknown mode" while keeping the output machine readable if dest: index, (view_mode, view_pos) = dest.get_index(), dest.get_view() out += "%s # %s %s" % ( diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index 3fe4c1e30..b1de99244 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -1,12 +1,10 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -__all__ = ("PdfBitmap", "PdfBitmapInfo") +__all__ = ("PdfBitmap", ) import ctypes import logging -import weakref -from collections import namedtuple import pypdfium2.raw as pdfium_c import pypdfium2.internal as pdfium_i from pypdfium2._helpers.misc import PdfiumError @@ -28,16 +26,8 @@ class PdfBitmap (pdfium_i.AutoCloseable): """ Bitmap helper class. - Hint: - This class provides built-in converters (e. g. :meth:`.to_pil`, :meth:`.to_numpy`) that may be used to create a different representation of the bitmap. - Converters can be applied on :class:`.PdfBitmap` objects either as bound method (``bitmap.to_*()``), or as function (``PdfBitmap.to_*(bitmap)``) - The second pattern is useful for API methods that need to apply a caller-provided converter (e. g. :meth:`.PdfDocument.render`) - .. _PIL Modes: https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes - Note: - All attributes of :class:`.PdfBitmapInfo` are available in this class as well. - Warning: ``bitmap.close()``, which frees the buffer of foreign bitmaps, is not validated for safety. 
A bitmap must not be closed when other objects still depend on its buffer! @@ -47,13 +37,36 @@ class PdfBitmap (pdfium_i.AutoCloseable): The underlying PDFium bitmap handle. buffer (~ctypes.c_ubyte): A ctypes array representation of the pixel data (each item is an unsigned byte, i. e. a number ranging from 0 to 255). + width (int): + Width of the bitmap (horizontal size). + height (int): + Height of the bitmap (vertical size). + stride (int): + Number of bytes per line in the bitmap buffer. + Depending on how the bitmap was created, there may be a padding of unused bytes at the end of each line, so this value can be greater than ``width * n_channels``. + format (int): + PDFium bitmap format constant (:attr:`FPDFBitmap_*`) + rev_byteorder (bool): + Whether the bitmap is using reverse byte order. + n_channels (int): + Number of channels per pixel. + mode (str): + The bitmap format as string (see `PIL Modes`_). """ def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, needs_free): - self.raw, self.buffer, self.width, self.height = raw, buffer, width, height - self.stride, self.format, self.rev_byteorder = stride, format, rev_byteorder + self.raw = raw + self.buffer = buffer + self.width = width + self.height = height + self.stride = stride + self.format = format + self.rev_byteorder = rev_byteorder self.n_channels = pdfium_i.BitmapTypeToNChannels[self.format] - self.mode = (pdfium_i.BitmapTypeToStrReverse if self.rev_byteorder else pdfium_i.BitmapTypeToStr)[self.format] + self.mode = { + False: pdfium_i.BitmapTypeToStr, + True: pdfium_i.BitmapTypeToStrReverse, + }[self.rev_byteorder][self.format] super().__init__(pdfium_c.FPDFBitmap_Destroy, needs_free=needs_free, obj=self.buffer) @@ -61,18 +74,6 @@ def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, ne def parent(self): # AutoCloseable hook return None - - def get_info(self): - """ - Returns: - PdfBitmapInfo: A namedtuple describing the bitmap. 
- """ - return PdfBitmapInfo( - width=self.width, height=self.height, stride=self.stride, format=self.format, - rev_byteorder=self.rev_byteorder, n_channels=self.n_channels, mode=self.mode, - ) - - @classmethod def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None): """ @@ -95,7 +96,7 @@ def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None): if ex_buffer is None: needs_free = True buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(raw) - if buffer_ptr is None: + if not buffer_ptr: raise PdfiumError("Failed to get bitmap buffer (null pointer returned)") buffer = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * (stride * height))).contents else: @@ -108,15 +109,15 @@ def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None): ) - # TODO support setting stride if external buffer is provided @classmethod - def new_native(cls, width, height, format, rev_byteorder=False, buffer=None): + def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, stride=None): """ Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by Python/ctypes. Bitmaps created by this function are always packed (no unused bytes at line end). """ - stride = width * pdfium_i.BitmapTypeToNChannels[format] + if stride is None: + stride = width * pdfium_i.BitmapTypeToNChannels[format] if buffer is None: buffer = (ctypes.c_ubyte * (stride * height))() raw = pdfium_c.FPDFBitmap_CreateEx(width, height, format, buffer, stride) @@ -211,14 +212,12 @@ def to_pil(self): """ Convert the bitmap to a :mod:`PIL` image, using :func:`PIL.Image.frombuffer`. - For ``RGBA``, ``RGBX`` and ``L`` buffers, PIL is supposed to share memory with - the original bitmap buffer, so changes to the buffer should be reflected in the image, and vice versa. + For ``RGBA``, ``RGBX`` and ``L`` bitmaps, PIL is supposed to share memory with + the original buffer, so changes to the buffer should be reflected in the image, and vice versa. Otherwise, PIL will make a copy of the data. 
Returns: PIL.Image.Image: PIL image (representation or copy of the bitmap buffer). - - .. versionchanged:: 4.16 Set ``image.readonly = False`` so that changes to the image are also reflected in the buffer. """ # https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.frombuffer @@ -234,45 +233,38 @@ def to_pil(self): self.stride, # bytes per line 1, # orientation (top->bottom) ) + # set `readonly = False` so changes to the image are reflected in the buffer, if the original buffer is used image.readonly = False return image @classmethod - def from_pil(cls, pil_image, recopy=False): + def from_pil(cls, pil_image): """ Convert a :mod:`PIL` image to a PDFium bitmap. - Due to the restricted number of color formats and bit depths supported by PDFium's - bitmap implementation, this may be a lossy operation. + Due to the restricted number of color formats and bit depths supported by FPDF_BITMAP, this may be a lossy operation. - Bitmaps returned by this function should be treated as immutable (i.e. don't call :meth:`.fill_rect`). + Bitmaps returned by this function should be treated as immutable. Parameters: pil_image (PIL.Image.Image): The image. Returns: PdfBitmap: PDFium bitmap (with a copy of the PIL image's data). - - .. deprecated:: 4.25 - The *recopy* parameter has been deprecated. """ + # FIXME possibility to get mutable buffer from PIL image? 
+ if pil_image.mode in pdfium_i.BitmapStrToConst: - # PIL always seems to represent BGR(A/X) input as RGB(A/X), so this code passage is probably only hit for L + # PIL always seems to represent BGR(A/X) input as RGB(A/X), so this code passage would only be reached for L format = pdfium_i.BitmapStrToConst[pil_image.mode] else: pil_image = _pil_convert_for_pdfium(pil_image) format = pdfium_i.BitmapStrReverseToConst[pil_image.mode] - py_buffer = pil_image.tobytes() - if recopy: - buffer = (ctypes.c_ubyte * len(py_buffer)).from_buffer_copy(py_buffer) - else: - buffer = py_buffer - w, h = pil_image.size - return cls.new_native(w, h, format, rev_byteorder=False, buffer=buffer) + return cls.new_native(w, h, format, rev_byteorder=False, buffer=pil_image.tobytes()) # TODO implement from_numpy() @@ -280,8 +272,6 @@ def from_pil(cls, pil_image, recopy=False): def _pil_convert_for_pdfium(pil_image): - # FIXME? convoluted / hard to understand; improve control flow - if pil_image.mode == "1": pil_image = pil_image.convert("L") elif pil_image.mode.startswith("RGB"): @@ -304,24 +294,3 @@ def _pil_convert_for_pdfium(pil_image): pil_image = PIL.Image.merge("RGBX", (b, g, r, x)) return pil_image - - -PdfBitmapInfo = namedtuple("PdfBitmapInfo", "width height stride format rev_byteorder n_channels mode") -""" -Attributes: - width (int): - Width of the bitmap (horizontal size). - height (int): - Height of the bitmap (vertical size). - stride (int): - Number of bytes per line in the bitmap buffer. - Depending on how the bitmap was created, there may be a padding of unused bytes at the end of each line, so this value can be greater than ``width * n_channels``. - format (int): - PDFium bitmap format constant (:attr:`FPDFBitmap_*`) - rev_byteorder (bool): - Whether the bitmap is using reverse byte order. - n_channels (int): - Number of channels per pixel. - mode (str): - The bitmap format as string (see `PIL Modes`_). 
-""" diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 1ba968e9a..2cb8b7e3f 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -42,9 +42,11 @@ class PdfDocument (pdfium_i.AutoCloseable): FileNotFoundError: Raised if an invalid or non-existent file path was given. Hint: + * Documents may be used in a ``with``-block, closing the document on context manager exit. + This is recommended when *input_data* is a file path, to safely and immediately release the opened file handle. * :func:`len` may be called to get a document's number of pages. - * Looping over a document will yield its pages from beginning to end. * Pages may be loaded using list index access. + * Looping over a document will yield its pages from beginning to end. * The ``del`` keyword and list index access may be used to delete pages. Attributes: @@ -68,8 +70,6 @@ def __init__(self, input, password=None, autoclose=False): self._autoclose = autoclose self._data_holder = [] self._data_closer = [] - - # question: can we make attributes like formenv effectively immutable for the caller? self.formenv = None if isinstance(self._input, pdfium_c.FPDF_DOCUMENT): @@ -82,6 +82,16 @@ def __init__(self, input, password=None, autoclose=False): super().__init__(PdfDocument._close_impl, self._data_holder, self._data_closer) + # Support using PdfDocument in a with-block + # Note that pdfium objects have to be closed in hierarchical order, but as this is ensured by the parents/kids system, callers don't have to worry about that.
+ + def __enter__(self): + return self + + def __exit__(self, *_): + self.close() + + def __repr__(self): if isinstance(self._input, Path): input_r = repr( str(self._input) ) @@ -523,37 +533,6 @@ def get_toc( logger.warning(f"Maximum recursion depth {max_depth} reached (subtree skipped).") bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr) - - - def render( - self, - converter, - renderer = PdfPage.render, - page_indices = None, - pass_info = False, - n_processes = None, # ignored, retained for compat - mk_formconfig = None, # ignored, retained for compat - **kwargs - ): - """ - .. deprecated:: 4.19 - This method will be removed with the next major release due to serious issues rooted in the original API design. Use :meth:`.PdfPage.render()` instead. - *Note that the CLI provides parallel rendering using a proper caller-side process pool with inline saving in rendering jobs.* - - .. versionchanged:: 4.25 - Removed the original process pool implementation and turned this into a wrapper for linear rendering, due to the serious conceptual issues and possible memory load escalation, especially with expensive receiving code (e.g. PNG encoding) or long documents. See the changelog for more info - """ - - warnings.warn("The document-level pdf.render() API is deprecated and uncored due to serious issues in the original concept. 
Use page.render() and a caller-side loop or process pool instead.", category=DeprecationWarning) - - if not page_indices: - page_indices = [i for i in range(len(self))] - for i in page_indices: - bitmap = renderer(self[i], **kwargs) - if pass_info: - yield (converter(bitmap), bitmap.get_info()) - else: - yield converter(bitmap) class PdfFormEnv (pdfium_i.AutoCloseable): From 677c4984eee092cf3bb7665a38a948bfe5227adc Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 23:02:25 +0200 Subject: [PATCH 010/140] Work on `PdfImage.extract()` --- docs/devel/changelog_staging.md | 4 ++- src/pypdfium2/_cli/extract_images.py | 4 +-- src/pypdfium2/_helpers/pageobjects.py | 39 +++++++++++---------------- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 03efeb76b..f7389f560 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -6,9 +6,11 @@ # Changelog for next release *API-breaking changes* -- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also, distinguish between `dest == None` and an empty dest. - Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). + Instead, use `PdfPage.render()` with a loop or process pool. - Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`. +- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. 
Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest. +- Removed `fb_render` parameter from `PdfImage.extract()` because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place. *Improvements and new features* - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py index 6fd94569e..6091d3489 100644 --- a/src/pypdfium2/_cli/extract_images.py +++ b/src/pypdfium2/_cli/extract_images.py @@ -37,7 +37,7 @@ def attach(parser): parser.add_argument( "--render", action = "store_true", - help = "Whether to get rendered bitmaps, taking masks and transform matrices into account. (Fallback if doing smart extraction.)", + help = "Whether to get rendered bitmaps, taking masks and transform matrices into account. (requires --use-bitmap, ignored with smart extraction)", ) @@ -71,7 +71,7 @@ def main(args): pil_image = image.get_bitmap(render=args.render).to_pil() pil_image.save( prefix.with_suffix("."+args.format) ) else: - image.extract(prefix, fb_format=args.format, fb_render=args.render) + image.extract(prefix, fb_format=args.format) except pdfium.PdfiumError: traceback.print_exc() image.close() diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 2be708f1a..a9702f2d2 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -23,8 +23,7 @@ class PdfObject (pdfium_i.AutoCloseable): """ Page object helper class. - When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, - depending on the object's :attr:`.type` (e. g. 
:class:`.PdfImage`). + When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, depending on the object's :attr:`.type` (e. g. :class:`.PdfImage`). Attributes: raw (FPDF_PAGEOBJECT): @@ -121,15 +120,13 @@ def transform(self, matrix): pdfium_c.FPDFPageObj_Transform(self, *matrix.get()) -# In principle, we would like to move PdfImage to a separate file, but it's not that easy because of the two-fold connection with PdfObject, which would run us into a circular import. (However, what we could do is externalize the class under a different name and turn PdfImage into a wrapper which merely inherits from that class.) - class PdfImage (PdfObject): """ Image object helper class (specific kind of page object). """ # cf. https://crbug.com/pdfium/1203 - #: Filters applied by :func:`FPDFImageObj_GetImageDataDecoded`. Hereafter referred to as "simple filters", while non-simple filters will be called "complex filters". + #: Filters applied by :func:`FPDFImageObj_GetImageDataDecoded`, referred to as "simple filters". Other filters are considered "complex filters". SIMPLE_FILTERS = ("ASCIIHexDecode", "ASCII85Decode", "RunLengthDecode", "FlateDecode", "LZWDecode") @@ -141,7 +138,7 @@ def new(cls, pdf): Returns: PdfImage: Handle to a new, empty image. Note that position and size of the image are defined by its matrix, which defaults to the identity matrix. - This means that new images will appear as a tiny square of 1x1 units on the bottom left corner of the page. + This means that new images will appear as a tiny square of 1x1 canvas units on the bottom left corner of the page. Use :class:`.PdfMatrix` and :meth:`.set_matrix` to adjust size and position. """ raw_img = pdfium_c.FPDFPageObj_NewImageObj(pdf) @@ -155,7 +152,7 @@ def get_metadata(self): Note: * The DPI values signify the resolution of the image on the PDF page, not the DPI metadata embedded in the image file. - * Due to issues in PDFium, this function can be slow. 
If you only need image size, prefer the faster :meth:`.get_size` instead. + * Due to issues in pdfium, this function might be slow on some kinds of images. If you only need size, prefer :meth:`.get_size` instead. Returns: FPDF_IMAGEOBJ_METADATA: Image metadata structure @@ -170,8 +167,6 @@ def get_metadata(self): def get_size(self): """ - .. versionadded:: 4.8/5731 - Returns: (int, int): Image dimensions as a tuple of (width, height). """ @@ -310,23 +305,21 @@ def get_filters(self, skip_simple=False): def extract(self, dest, *args, **kwargs): - # TODO rewrite/simplify docstring """ - Extract the image into an independently usable file or byte buffer. - Where possible within PDFium's limited public API, it will be attempted to transfer the image data directly, - avoiding an unnecessary layer of decoding and re-encoding. - Otherwise, the fully decoded data will be retrieved and (re-)encoded using :mod:`PIL`. + Extract the image into an independently usable file or byte buffer, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits. - As PDFium does not expose all required information, only DCTDecode (JPEG) and JPXDecode (JPEG 2000) images can be extracted directly. - For images with complex filters, the bitmap data is used. Otherwise, ``get_data(decode_simple=True)`` is used, which avoids lossy conversion for images whose bit depth or colour format is not supported by PDFium's bitmap implementation. + Only DCTDecode (JPEG) and JPXDecode (JPEG 2000) images can be extracted directly. + Otherwise, the pixel data is decoded, and re-encoded using :mod:`PIL`. + For images with simple filters only, ``get_data(decode_simple=True)`` is used for decoding to preserve higher bit depth or special color formats not supported by FPDF_BITMAP. + For images with complex filters, we have to resort to :meth:`.get_bitmap`, which can be a lossy operation. 
+ + Note, this method ignores alpha masks and some other data stored separately from the main data stream (e.g. BlackIsWhite), which might lead to incorrect representation of the image. Parameters: dest (str | io.BytesIO): File prefix or byte buffer to which the image shall be written. fb_format (str): The image format to use in case it is necessary to (re-)encode the data. - fb_render (bool): - Whether the image should be rendered if falling back to bitmap-based extraction. """ # https://crbug.com/pdfium/1930 @@ -367,15 +360,13 @@ def _get_pil_mode(colorspace, bpp): return None -def _extract_smart(image_obj, fb_format=None, fb_render=False): - - # FIXME somewhat hard to read... +def _extract_smart(image_obj, fb_format=None): try: data, info = _extract_direct(image_obj) except ImageNotExtractableError: # TODO? log reason why the image cannot be extracted directly - pil_image = image_obj.get_bitmap(render=fb_render).to_pil() + pil_image = image_obj.get_bitmap(render=False).to_pil() else: pil_image = None format = info.format @@ -389,7 +380,9 @@ def _extract_smart(image_obj, fb_format=None, fb_render=False): ) if pil_image: - format = fb_format if fb_format else "tiff" if pil_image.mode == "CMYK" else "png" + format = fb_format + if not format: + format = {"CMYK": "tiff"}.get(pil_image.mode, "png") buffer = yield format pil_image.save(buffer, format=format) if pil_image else buffer.write(data) From 4de863d8099d686a844815d8b5b43fe8bcecadc5 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 5 Apr 2024 00:02:27 +0200 Subject: [PATCH 011/140] Fix some object pointer checks against None Use bool() rather than checking against None. See findings in get_toc(): "We need bool(ptr) here to handle cases where .contents is a null pointer (raises exception on access). Don't use ptr != None, it's always true." 
--- src/pypdfium2/_helpers/document.py | 2 +- src/pypdfium2/_helpers/page.py | 2 +- src/pypdfium2/_helpers/pageobjects.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 2cb8b7e3f..c518603b2 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -486,7 +486,7 @@ def page_as_xobject(self, index, dest_pdf): PdfXObject: The page as XObject. """ raw_xobject = pdfium_c.FPDF_NewXObjectFromPage(dest_pdf, self, index) - if raw_xobject is None: + if not raw_xobject: raise PdfiumError(f"Failed to capture page at index {index} as FPDF_XOBJECT.") xobject = PdfXObject(raw=raw_xobject, pdf=dest_pdf) self._add_kid(xobject) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 38c7ab4be..f536b6e5b 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -280,7 +280,7 @@ def get_objects(self, filter=None, max_depth=2, form=None, level=0): for i in range(n_objects): raw_obj = get_object(parent, i) - if raw_obj is None: + if not raw_obj: raise PdfiumError("Failed to get page object.") helper_obj = PdfObject(raw_obj, page=self, pdf=self.pdf, level=level) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index a9702f2d2..0d712fec3 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -258,7 +258,7 @@ def get_bitmap(self, render=False): else: raw_bitmap = pdfium_c.FPDFImageObj_GetBitmap(self) - if raw_bitmap is None: + if not raw_bitmap: raise PdfiumError(f"Failed to get bitmap of image {self}.") return PdfBitmap.from_raw(raw_bitmap) From ccfe92358b44af604f09c9f9fc932826e09a0931 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 5 Apr 2024 00:06:53 +0200 Subject: [PATCH 012/140] Address `run check` findings --- README.md | 2 +- docs/devel/changelog.md | 4 ++-- src/pypdfium2/_helpers/document.py | 5 ----- 3 files 
changed, 3 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index d052a55d1..adad42e55 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct * From source 🔗 * Dependencies: - - System: git, C pre-processor (gcc/clang - alternatively, specify the command to envoke via `$CPP`) + - System: git, C pre-processor (gcc/clang - alternatively, specify the command to invoke via `$CPP`) - Python: ctypesgen (pypdfium2-team fork), wheel, setuptools. Usually installed automatically. * Get the code diff --git a/docs/devel/changelog.md b/docs/devel/changelog.md index efe398cd3..3c1c7b7d3 100644 --- a/docs/devel/changelog.md +++ b/docs/devel/changelog.md @@ -23,7 +23,7 @@ ## 4.26.0 (2024-01-10) - Updated PDFium from `6164` to `6233`. -- Pin ctypesgen in sdist to prevent reoccurrence of {issue}`264` / {issue}`286`. As a drawback, the pin is never committed, so the sdist is not simply reproducible at this time due to dependence on the latest commit hash of the ctypesgen fork at build time. +- Pin ctypesgen in sdist to prevent re-occurrence of {issue}`264` / {issue}`286`. As a drawback, the pin is never committed, so the sdist is not simply reproducible at this time due to dependence on the latest commit hash of the ctypesgen fork at build time. - Wheel tags: Added back `manylinux2014` in addition to `manylinux_{glibc_ver}` to be on the safe side. Suspected relation to the above issues. @@ -44,7 +44,7 @@ - The parallel rendering API unfortunately was an inherent design mistake: Multiprocessing is not meant to transfer large amounts of pixel data from workers to the main process. - This was such a heavy drawback that it basically outweighed the parallelization, so there was no real performance advantage, only higher memory load. 
-- As a related problem, the worker pool produces bitmaps at an indepedent speed, regardless of where the receiving iteration might be, so bitmaps could queue up in memory, possibly causing an enormeous rise in memory consumption over time. This effect was pronounced e.g. with PNG saving via PIL, as exhibited in Facebook's `nougat` project. +- As a related problem, the worker pool produces bitmaps at an independent speed, regardless of where the receiving iteration might be, so bitmaps could queue up in memory, possibly causing an enormeous rise in memory consumption over time. This effect was pronounced e.g. with PNG saving via PIL, as exhibited in Facebook's `nougat` project. - Instead, each bitmap should be processed (e.g. saved) in the job which created it. Only a minimal, final result should be sent back to the main process (e.g. a file path). - This means we cannot reasonably provide a generic parallel renderer, instead it needs to be implemented by callers. - Historically, note that there had been even more faults in the implementation: diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index c518603b2..81eed78f3 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -3,14 +3,9 @@ __all__ = ("PdfDocument", "PdfFormEnv", "PdfXObject", "PdfBookmark", "PdfDest") -import os import ctypes import logging -import inspect -import warnings from pathlib import Path -from collections import namedtuple -import multiprocessing as mp import pypdfium2.raw as pdfium_c import pypdfium2.internal as pdfium_i From 2360165e2cea18aae0856167e3549ba1bcf846d0 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 5 Apr 2024 00:14:15 +0200 Subject: [PATCH 013/140] Expand constructor assignments This is longer, but cleaner. Imagine you have to edit it and assignment order gets wrong :P BTW, normalize PdfFormEnv constructor param order. 
--- src/pypdfium2/_helpers/attachment.py | 3 ++- src/pypdfium2/_helpers/document.py | 18 ++++++++++++------ src/pypdfium2/_helpers/matrix.py | 3 --- src/pypdfium2/_helpers/page.py | 4 +++- src/pypdfium2/_helpers/pageobjects.py | 5 ++++- 5 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/pypdfium2/_helpers/attachment.py b/src/pypdfium2/_helpers/attachment.py index 5b69e62b6..ef5f45457 100644 --- a/src/pypdfium2/_helpers/attachment.py +++ b/src/pypdfium2/_helpers/attachment.py @@ -36,7 +36,8 @@ class PdfAttachment (pdfium_i.AutoCastable): def __init__(self, raw, pdf): - self.raw, self.pdf = raw, pdf + self.raw = raw + self.pdf = pdf def get_name(self): diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 81eed78f3..75ef2ef90 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -172,7 +172,7 @@ def init_forms(self, config=None): raw = pdfium_c.FPDFDOC_InitFormFillEnvironment(self, config) if not raw: raise PdfiumError(f"Initializing form env failed for document {self}.") - self.formenv = PdfFormEnv(raw, config, self) + self.formenv = PdfFormEnv(raw, self, config) self._add_kid(self.formenv) if formtype in (pdfium_c.FORMTYPE_XFA_FULL, pdfium_c.FORMTYPE_XFA_FOREGROUND): @@ -543,8 +543,10 @@ class PdfFormEnv (pdfium_i.AutoCloseable): Parent document this form env belongs to. 
""" - def __init__(self, raw, config, pdf): - self.raw, self.config, self.pdf = raw, config, pdf + def __init__(self, raw, pdf, config): + self.raw = raw + self.pdf = pdf + self.config = config super().__init__(PdfFormEnv._close_impl, self.config, self.pdf) @property @@ -568,7 +570,8 @@ class PdfXObject (pdfium_i.AutoCloseable): """ def __init__(self, raw, pdf): - self.raw, self.pdf = raw, pdf + self.raw = raw + self.pdf = pdf super().__init__(pdfium_c.FPDF_CloseXObject) @property @@ -629,7 +632,9 @@ class PdfBookmark (pdfium_i.AutoCastable): """ def __init__(self, raw, pdf, level): - self.raw, self.pdf, self.level = raw, pdf, level + self.raw = raw + self.pdf = pdf + self.level = level def get_title(self): """ @@ -670,7 +675,8 @@ class PdfDest (pdfium_i.AutoCastable): """ def __init__(self, raw, pdf): - self.raw, self.pdf = raw, pdf + self.raw = raw + self.pdf = pdf def get_index(self): """ diff --git a/src/pypdfium2/_helpers/matrix.py b/src/pypdfium2/_helpers/matrix.py index 4d9aff402..a8cab515d 100644 --- a/src/pypdfium2/_helpers/matrix.py +++ b/src/pypdfium2/_helpers/matrix.py @@ -40,17 +40,14 @@ class PdfMatrix: def __init__(self, a=1, b=0, c=0, d=1, e=0, f=0): self.a, self.b, self.c, self.d, self.e, self.f = a, b, c, d, e, f - def __repr__(self): return f"PdfMatrix{self.get()}" - def __eq__(self, other): if type(self) is not type(other): return False return (self.get() == other.get()) - @property def _as_parameter_(self): return ctypes.byref( self.to_raw() ) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index f536b6e5b..e0dd5d42c 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -27,7 +27,9 @@ class PdfPage (pdfium_i.AutoCloseable): """ def __init__(self, raw, pdf, formenv): - self.raw, self.pdf, self.formenv = raw, pdf, formenv + self.raw = raw + self.pdf = pdf + self.formenv = formenv super().__init__(PdfPage._close_impl, self.formenv) diff --git a/src/pypdfium2/_helpers/pageobjects.py 
b/src/pypdfium2/_helpers/pageobjects.py index 0d712fec3..e83bbf5c7 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -55,7 +55,10 @@ def __new__(cls, raw, *args, **kwargs): def __init__(self, raw, page=None, pdf=None, level=0): - self.raw, self.page, self.pdf, self.level = raw, page, pdf, level + self.raw = raw + self.page = page + self.pdf = pdf + self.level = level if page is not None: if self.pdf is None: From c581f5af2614cbd74e3b1fbd785f1d9e7b6bee53 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 5 Apr 2024 00:22:06 +0200 Subject: [PATCH 014/140] autorelease: add task --- setupsrc/pypdfium2_setup/autorelease.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setupsrc/pypdfium2_setup/autorelease.py b/setupsrc/pypdfium2_setup/autorelease.py index 520f712c4..85d94639a 100644 --- a/setupsrc/pypdfium2_setup/autorelease.py +++ b/setupsrc/pypdfium2_setup/autorelease.py @@ -140,6 +140,7 @@ def make_releasenotes(summary, prev_pdfium, new_pdfium, prev_tag, new_tag, c_upd if c_updates: with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) + # FIXME seems to take rather long - possibility to limit history size? 
run_cmd(["git", "clone", "--filter=blob:none", "--no-checkout", PdfiumURL, "pdfium_history"], cwd=tmpdir) relnotes += _get_log( "PDFium", PdfiumURL, tmpdir/"pdfium_history", From 81f2b4af50144ba75bf12ef06bd82ddbb5863534 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 5 Apr 2024 00:26:31 +0200 Subject: [PATCH 015/140] slightly improve wording for v4.25 changelog --- docs/devel/changelog.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/devel/changelog.md b/docs/devel/changelog.md index 3c1c7b7d3..f1e08d3a0 100644 --- a/docs/devel/changelog.md +++ b/docs/devel/changelog.md @@ -43,10 +43,10 @@ #### Rationale for `PdfDocument.render()` deprecation - The parallel rendering API unfortunately was an inherent design mistake: Multiprocessing is not meant to transfer large amounts of pixel data from workers to the main process. -- This was such a heavy drawback that it basically outweighed the parallelization, so there was no real performance advantage, only higher memory load. -- As a related problem, the worker pool produces bitmaps at an independent speed, regardless of where the receiving iteration might be, so bitmaps could queue up in memory, possibly causing an enormeous rise in memory consumption over time. This effect was pronounced e.g. with PNG saving via PIL, as exhibited in Facebook's `nougat` project. +- Bitmap transfer is so expensive that it essentially outweighed parallelization, so there was no real performance advantage, only higher memory load. +- As a related problem, the worker pool produces bitmaps at an independent speed, regardless of where the receiving iteration might be, so bitmaps could queue up in memory, possibly causing an enormous rise in memory consumption over time. This effect was pronounced e.g. with PNG saving via PIL, as seen in Facebook's `nougat` project. - Instead, each bitmap should be processed (e.g. saved) in the job which created it.
Only a minimal, final result should be sent back to the main process (e.g. a file path). -- This means we cannot reasonably provide a generic parallel renderer, instead it needs to be implemented by callers. +- This means we cannot reasonably provide a generic parallel renderer; instead it needs to be implemented by callers. - Historically, note that there had been even more faults in the implementation: * Prior to `4.22.0`, the pool was always initialized with `os.cpu_count()` processes by default, even when rendering less pages. * Prior to `4.20.0`, a full-scale input transfer was conducted on each job (rendering it unusable with bytes input). However, this can and should be done only once on process creation. From ddc3f3a036c57f6b653ac0119845862f852413f5 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 5 Apr 2024 14:39:04 +0200 Subject: [PATCH 016/140] Remove deprecated version API --- docs/devel/changelog_staging.md | 3 ++- src/pypdfium2/version.py | 25 ++++--------------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index f7389f560..c2ddea672 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -10,7 +10,8 @@ Instead, use `PdfPage.render()` with a loop or process pool. - Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`. - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest. -- Removed `fb_render` parameter from `PdfImage.extract()` because it does not fit in this API. 
If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place. +- Removed `fb_render` param from `PdfImage.extract()` because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place. +- Removed some deprecated members/params (e.g. legacy version flags, `recopy` of `PdfBitmap.from_pil()`) *Improvements and new features* - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py index 7dd604a25..dfbab36af 100644 --- a/src/pypdfium2/version.py +++ b/src/pypdfium2/version.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -__all__ = [] +__all__ = ("PYPDFIUM_INFO", "PDFIUM_INFO") import sys import json @@ -11,7 +11,8 @@ import pypdfium2_raw -# TODO move to shared compat file +# TODO remove caching and just assign everything on init/lib startup + if sys.version_info < (3, 8): def cached_property(func): return property( functools.lru_cache(maxsize=1)(func) ) @@ -116,29 +117,11 @@ def desc(self): # TODO(future) add bindings info (e.g. ctypesgen version, reference/generated, runtime libdirs) -# Current API +# API PYPDFIUM_INFO = _version_pypdfium2() PDFIUM_INFO = _version_pdfium() -__all__ += ["PYPDFIUM_INFO", "PDFIUM_INFO"] - -# ----- - - -# Deprecated API, to be removed with v5 -# Known issue: causes eager evaluation of the new API's theoretically deferred properties. 
- -V_PYPDFIUM2 = PYPDFIUM_INFO.version -V_LIBPDFIUM = str(PDFIUM_INFO.build) -V_BUILDNAME = PDFIUM_INFO.origin -V_PDFIUM_IS_V8 = "V8" in PDFIUM_INFO.flags # implies XFA -V_LIBPDFIUM_FULL = PDFIUM_INFO.version - -__all__ += ["V_PYPDFIUM2", "V_LIBPDFIUM", "V_LIBPDFIUM_FULL", "V_BUILDNAME", "V_PDFIUM_IS_V8"] - -# ----- - # Docs From 3acc545704f7b24d3cc918d179005ec408dc035e Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 5 Apr 2024 15:09:40 +0200 Subject: [PATCH 017/140] Simplify version impl I figured the deferred API doesn't make much sense, because this should compute quickly anyway. Caching is more for expensive properties. Also, we have to access the pdfium info on init for XFA/V8 checks, so it never would be truly deferred anyway. --- docs/devel/changelog_staging.md | 1 + src/pypdfium2/version.py | 98 +++++++++++---------------------- 2 files changed, 32 insertions(+), 67 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index c2ddea672..ed89388cf 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -15,6 +15,7 @@ *Improvements and new features* - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. +- Simplified version implementation (no API change expected). All attributes are now assigned and show up in `dir(...)`, instead of `__getattr__` magic. * Read the table of contents ```python - for item in pdf.get_toc(): - state = "*" if item.n_kids == 0 else "-" if item.is_closed else "+" - target = "?" 
if item.page_index is None else item.page_index+1 - print( - " " * item.level + - "[%s] %s -> %s # %s %s" % ( - state, item.title, target, item.view_mode, item.view_pos, - ) + import pypdfium2.internal as pdfium_i + + for bm in pdf.get_toc(max_depth=15): + count, dest = bm.get_count(), bm.get_dest() + out = " " * bm.level + out += "[%s] %s -> " % ( + f"{count:+}" if count != 0 else "*", + bm.get_title(), ) + if dest: + index, (view_mode, view_pos) = dest.get_index(), dest.get_view() + out += "%s # %s %s" % ( + index+1 if index != None else "?", + pdfium_i.ViewmodeToStr.get(view_mode), + [round(v, 3) for v in view_pos], + ) + else: + out += "_" + print(out) ``` * Create a new PDF with an empty A4 sized page diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 352bf87f1..152aa8582 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -22,7 +22,7 @@ - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`. -- Simplified version implementation (no API change expected). All attributes are now assigned and show up in `dir(...)`, instead of `__getattr__` magic. +- Simplified version implementation (no API change expected). Replaced `__getattr__` magic with assignments, so all attributes now show up in `dir()`. +- Added `PdfPosConv` helper and `PdfPage.get_posconv(bitmap)` for bidirectional translation between page and bitmap coordinates. - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. - Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it.
Added an assertion to make sure requirements are met, and updated docs accordingly. - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. From cd064b94e85774678c6b00e1bbe6cf223d319a06 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 9 Apr 2024 16:57:57 +0200 Subject: [PATCH 036/140] Move get_posconv() to bitmap --- docs/devel/changelog_staging.md | 3 +- src/pypdfium2/_helpers/bitmap.py | 53 ++++++++++++++++++++++++++++++-- src/pypdfium2/_helpers/page.py | 52 +------------------------------ tests/test_rendering.py | 2 +- 4 files changed, 54 insertions(+), 56 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index b9980f85c..909d28b77 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -19,8 +19,7 @@ - Removed legacy version flags. *Improvements and new features* - -- Added `PdfPosConv` helper and `PdfPage.get_posconv(bitmap)` for bidirectional translation between page and bitmap coordinates. +- Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates. - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. - Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added an assertion to make sure requirements are met, and updated docs accordingly. - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. 
diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index 1e7a9b1c6..f98fe76dd 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -__all__ = ("PdfBitmap", ) +__all__ = ("PdfBitmap", "PdfPosConv") import ctypes import logging @@ -79,6 +79,7 @@ def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, ne def parent(self): # AutoCloseable hook return None + @classmethod def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None): """ @@ -275,7 +276,16 @@ def from_pil(cls, pil_image): return cls.new_native(w, h, format, rev_byteorder=False, buffer=pil_image.tobytes()) - # TODO implement from_numpy() + def get_posconv(self, page): + """ + Acquire a :class:`.PdfPosConv` coordinate translator for this bitmap and the page it was rendered from. + + This API requires passing in the page explicitly, to avoid holding a strong reference, so that bitmap and page can be freed by finalizer independently. + """ + # if the bitmap was rendered from a page, resolve weakref and check identity + if not self._pos_args or self._pos_args[0]() is not page: + raise RuntimeError("This bitmap does not belong to the given page.") + return PdfPosConv(page, self._pos_args[1:]) def _pil_convert_for_pdfium(pil_image): @@ -302,3 +312,42 @@ def _pil_convert_for_pdfium(pil_image): pil_image = PIL.Image.merge("RGBX", (b, g, r, x)) return pil_image + + +class PdfPosConv: + """ + Pdf coordinate translator. + + Hint: + You may want to use :meth:`.PdfBitmap.get_posconv` to obtain an instance of this class. + + Parameters: + page (PdfPage): + Handle to the page. + pos_args (tuple[int*5]): + pdfium canvas args (start_x, start_y, size_x, size_y, rotate), as in ``FPDF_RenderPageBitmap()`` etc. 
+ """ + + def __init__(self, page, pos_args): + self.page = page + self.pos_args = pos_args + + def to_page(self, bitmap_x, bitmap_y): + """ + Translate coordinates from bitmap to page. + """ + page_x, page_y = ctypes.c_double(), ctypes.c_double() + ok = pdfium_c.FPDF_DeviceToPage(self.page, *self.pos_args, bitmap_x, bitmap_y, page_x, page_y) + if not ok: + raise PdfiumError("Failed to translate to page coordinates.") + return (page_x.value, page_y.value) + + def to_bitmap(self, page_x, page_y): + """ + Translate coordinates from page to bitmap. + """ + bitmap_x, bitmap_y = ctypes.c_int(), ctypes.c_int() + ok = pdfium_c.FPDF_PageToDevice(self.page, *self.pos_args, page_x, page_y, bitmap_x, bitmap_y) + if not ok: + raise PdfiumError("Failed to translate to bitmap coordinates.") + return (bitmap_x.value, bitmap_y.value) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 614131a82..14b72a76a 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -__all__ = ("PdfPage", "PdfPosConv") +__all__ = ("PdfPage", ) import math import ctypes @@ -322,16 +322,6 @@ def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY): return rc - def get_posconv(self, bitmap): - """ - Acquire a :class:`.PdfPosConv` coordinate translator for a :class:`.PdfBitmap` rendered from this page. 
- """ - # if the bitmap was rendered from a page, resolve weakref and check identity - if not bitmap._pos_args or bitmap._pos_args[0]() is not self: - raise RuntimeError("The given bitmap does not belong to this page.") - return PdfPosConv(self, bitmap._pos_args[1:]) - - # TODO # - add helpers for matrix-based and interruptible rendering # - add lower-level renderer that takes a caller-provided bitmap @@ -511,43 +501,3 @@ def _parse_renderopts( # TODO consider using a namedtuple or something return cl_format, rev_byteorder, fill_color, flags - - -class PdfPosConv: - """ - Pdf coordinate translator. - - Hint: - Use :meth:`.PdfPage.get_posconv` to obtain an instance of this class. - It is not normally necessary to access the :class:`.PdfPosConv` constructor directly. - - Parameters: - page (PdfPage): - Handle to the page. - pos_args (tuple[int*5]): - pdfium canvas args (start_x, start_y, size_x, size_y, rotate), as in ``FPDF_RenderPageBitmap()`` etc. - """ - - def __init__(self, page, pos_args): - self.page = page - self.pos_args = pos_args - - def to_page(self, bitmap_x, bitmap_y): - """ - Translate coordinates from bitmap to page. - """ - page_x, page_y = ctypes.c_double(), ctypes.c_double() - ok = pdfium_c.FPDF_DeviceToPage(self.page, *self.pos_args, bitmap_x, bitmap_y, page_x, page_y) - if not ok: - raise PdfiumError("Failed to translate to page coordinates.") - return (page_x.value, page_y.value) - - def to_bitmap(self, page_x, page_y): - """ - Translate coordinates from page to bitmap. 
- """ - bitmap_x, bitmap_y = ctypes.c_int(), ctypes.c_int() - ok = pdfium_c.FPDF_PageToDevice(self.page, *self.pos_args, page_x, page_y, bitmap_x, bitmap_y) - if not ok: - raise PdfiumError("Failed to translate to bitmap coordinates.") - return (bitmap_x.value, bitmap_y.value) diff --git a/tests/test_rendering.py b/tests/test_rendering.py index d757d0b9a..e1ee21f82 100644 --- a/tests/test_rendering.py +++ b/tests/test_rendering.py @@ -393,7 +393,7 @@ def test_draw_image_borders(): pdf_qpl = [i.get_quad_points() for i in images] bitmap = page.render(scale=1) - posconv = page.get_posconv(bitmap) + posconv = bitmap.get_posconv(page) pil_image = bitmap.to_pil() bitmap_qpl = [[posconv.to_bitmap(x, y) for x, y in qps] for qps in pdf_qpl] From ad4ec2c56a54546b497639cd7c27237c6eb68347 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 9 Apr 2024 18:39:48 +0200 Subject: [PATCH 037/140] Add experimental position normalizer --- docs/devel/changelog_staging.md | 1 + src/pypdfium2/_helpers/bitmap.py | 6 ++- src/pypdfium2/_helpers/page.py | 65 +++++++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 909d28b77..ce0d4c8da 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -20,6 +20,7 @@ *Improvements and new features* - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates. +- Also added `PdfPosNormalizer` and `PdfPage.get_pos_normalizer()` as a wrapper around `PdfPosConv`. - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. - Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added an assertion to make sure requirements are met, and updated docs accordingly. 
- Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index f98fe76dd..a9343a6d9 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -278,9 +278,9 @@ def from_pil(cls, pil_image): def get_posconv(self, page): """ - Acquire a :class:`.PdfPosConv` coordinate translator for this bitmap and the page it was rendered from. + Acquire a :class:`.PdfPosConv` object to translate between coordinates on the bitmap and the page it was rendered from. - This API requires passing in the page explicitly, to avoid holding a strong reference, so that bitmap and page can be freed by finalizer independently. + This method requires passing in the page explicitly, to avoid holding a strong reference, so that bitmap and page can be independently freed by finalizer. """ # if the bitmap was rendered from a page, resolve weakref and check identity if not self._pos_args or self._pos_args[0]() is not page: @@ -328,6 +328,8 @@ class PdfPosConv: pdfium canvas args (start_x, start_y, size_x, size_y, rotate), as in ``FPDF_RenderPageBitmap()`` etc. """ + # FIXME would we have to do overflow checking against too large sizes? 
+ def __init__(self, page, pos_args): self.page = page self.pos_args = pos_args diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 14b72a76a..49e455a3c 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -__all__ = ("PdfPage", ) +__all__ = ("PdfPage", "PdfPosNormalizer") import math import ctypes @@ -10,7 +10,7 @@ import pypdfium2.raw as pdfium_c import pypdfium2.internal as pdfium_i from pypdfium2._helpers.misc import PdfiumError -from pypdfium2._helpers.bitmap import PdfBitmap +from pypdfium2._helpers.bitmap import PdfBitmap, PdfPosConv from pypdfium2._helpers.textpage import PdfTextPage from pypdfium2._helpers.pageobjects import PdfObject @@ -322,6 +322,31 @@ def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY): return rc + def get_pos_normalizer(self, ps=5, origin="bottom_left"): + """ + Set up a coordinate normalizer object that may be used to apply PDF coordinate system transformations to values, or unapply them. + + This may be useful when writing PDF position data to a format that assumes a strict coordinate system, or to conveniently translate visual input values to raw values (e.g. swapping crop for a page with rotated/mirrored coordinate system). + + Note, as pdfium itself does not currently expose a generic coordinate normalizer, we are abusing the page <-> raster translator APIs by supplying a fictional raster of a certain scale, which is rather inelegant, as there is some back-and-forth calculation and an inherent loss of precision (though it can be made irrelevantly small), due to interjection of the raster. + + Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`PdfPosConv` directly, to avoid even more unnecessary calculation.
+ + Parameters: + ps (float): + Scale factor to use for the fictional raster. Controls the precision of normalized values. + origin (str): + The corner to use as origin (``bottom_left`` or ``top_left``). + The underlying pdfium API works with top left, but the default here is bottom left so that raw and normalized values align for a non-transformed coordinate system. + Returns: + PdfPosNormalizer + """ + w, h = self.get_size() + w, h = round(w*ps), round(h*ps) + posconv = PdfPosConv(self, (0, 0, w, h, 0)) + return PdfPosNormalizer(posconv, ps, origin) + + # TODO # - add helpers for matrix-based and interruptible rendering # - add lower-level renderer that takes a caller-provided bitmap @@ -501,3 +526,39 @@ def _parse_renderopts( # TODO consider using a namedtuple or something return cl_format, rev_byteorder, fill_color, flags + + +class PdfPosNormalizer: + """ + Pdf coordinate normalizer. + See :meth:`.PdfPage.get_pos_normalizer` for description. + """ + + def __init__(self, posconv, ps, origin): + self._posconv = posconv + self._ps = ps + if origin == "top_left": + self._translate_y = lambda y: y + elif origin == "bottom_left": + size_y = posconv.pos_args[3] + self._translate_y = lambda y: size_y - y + else: + raise ValueError(f"Origin {origin!r} is not a supported corner.") + + def to_norm(self, raw_x, raw_y): + """ + Translate raw to normalized coordinates. This applies coordinate system transformations. + """ + x, y = self._posconv.to_bitmap(raw_x, raw_y) + x = x / self._ps + y = self._translate_y(y) / self._ps + return x, y + + def to_raw(self, norm_x, norm_y): + """ + Translate normalized to raw coordinates. + This unapplies coordinate system transformations by doing the inverse transformation. 
+ """ + x = round(norm_x * self._ps) + y = round(self._translate_y(norm_y * self._ps)) + return self._posconv.to_page(x, y) From 00a738bb28be2e3d1e588802b391a60672960f7a Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 9 Apr 2024 23:49:33 +0200 Subject: [PATCH 038/140] docs: include changelog_staging also with non-main branches The decisive aspect is not what branch we are on, but whether we are on a tagged codebase or not. --- docs/source/changelog.rst | 2 +- docs/source/conf.py | 39 ++++++---------------- docs/source/index.rst | 2 +- setupsrc/pypdfium2_setup/autorelease.py | 19 ++--------- setupsrc/pypdfium2_setup/packaging_base.py | 15 +++++++++ 5 files changed, 31 insertions(+), 46 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index b1ca25645..5c526855b 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -4,7 +4,7 @@ Changelog ========= -.. ifconfig:: build_type == 'latest' +.. ifconfig:: have_changes .. warning:: This is a documentation build for an unreleased version of pypdfium2, so it is possible that new changes are not logged yet. diff --git a/docs/source/conf.py b/docs/source/conf.py index 18a452957..b87e1db9c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,7 +5,6 @@ # See https://www.sphinx-doc.org/en/master/usage/configuration.html # and https://docs.readthedocs.io/en/stable/environment-variables.html -import os import sys import time import collections @@ -13,27 +12,15 @@ sys.path.insert(0, str(Path(__file__).parents[2] / "setupsrc")) from pypdfium2_setup.packaging_base import ( - run_cmd, - ProjectDir, + parse_git_tag, + get_next_changelog, ) - -def _get_build_type(): - - # RTD uses git checkout --force origin/... 
which results in a detached HEAD state, so we cannot easily get the branch name - # Thus query for an RTD-specific environment variable instead - rtd_vn = os.environ.get("READTHEDOCS_VERSION_NAME", None) - if rtd_vn: - return rtd_vn - - branch = run_cmd(["git", "branch", "--show-current"], cwd=ProjectDir, capture=True) - if branch == "main": - return "latest" - else: - return branch - - -build_type = _get_build_type() +# FIXME not sure if this will work on RTD +tag_info = parse_git_tag() +have_changes = tag_info["n_commits"] > 0 or tag_info["dirty"] +if get_next_changelog(): + assert have_changes project = "pypdfium2" author = "pypdfium2-team" @@ -81,14 +68,10 @@ def _get_build_type(): # https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-rst_prolog # .. |br| raw:: html - #
-rst_prolog = """ -.. |build_type| replace:: %(build_type)s -""" % dict( - build_type = build_type, -) - +rst_prolog = f""" +.. |have_changes| replace:: {have_changes} +""" def remove_namedtuple_aliases(app, what, name, obj, skip, options): if type(obj) is collections._tuplegetter: @@ -98,4 +81,4 @@ def remove_namedtuple_aliases(app, what, name, obj, skip, options): def setup(app): app.connect('autodoc-skip-member', remove_namedtuple_aliases) - app.add_config_value("build_type", "latest", "env") + app.add_config_value("have_changes", True, "env") diff --git a/docs/source/index.rst b/docs/source/index.rst index b67e35815..60d4e81ea 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,7 +4,7 @@ pypdfium2 ========= -Welcome to the documentation for the support model of pypdfium2 (|build_type| build). +Welcome to the documentation for the support model of pypdfium2. .. toctree:: :maxdepth: 2 diff --git a/setupsrc/pypdfium2_setup/autorelease.py b/setupsrc/pypdfium2_setup/autorelease.py index 85d94639a..520999cdc 100644 --- a/setupsrc/pypdfium2_setup/autorelease.py +++ b/setupsrc/pypdfium2_setup/autorelease.py @@ -151,21 +151,6 @@ def make_releasenotes(summary, prev_pdfium, new_pdfium, prev_tag, new_tag, c_upd (ProjectDir/"RELEASE.md").write_text(relnotes) -def get_changelog_staging(beta): - - content = ChangelogStaging.read_text() - pos = content.index("\n", content.index("# Changelog")) + 1 - header = content[:pos].strip() + "\n" - devel_msg = content[pos:].strip() - if devel_msg: - devel_msg += "\n" - - if beta is None: # flush - ChangelogStaging.write_text(header) - - return devel_msg - - def main(): parser = argparse.ArgumentParser( @@ -193,7 +178,9 @@ def main(): write_json(AR_RecordFile, dict(pdfium=new_pdfium, tag=new_tag)) update_refbindings(latest_pdfium) - summary = get_changelog_staging(new_helpers["beta"]) + summary = get_next_changelog( + flush = new_helpers["beta"] is None + ) log_changes(summary, record["pdfium"], new_pdfium, new_tag, 
new_helpers["beta"]) if args.register: register_changes(new_tag) diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py index a8221a3ca..099a9ac59 100644 --- a/setupsrc/pypdfium2_setup/packaging_base.py +++ b/setupsrc/pypdfium2_setup/packaging_base.py @@ -633,3 +633,18 @@ def parse_modspec(modspec): else: modnames = ModulesAll return modnames + + +def get_next_changelog(flush=False): + + content = ChangelogStaging.read_text() + pos = content.index("\n", content.index("# Changelog")) + 1 + header = content[:pos].strip() + "\n" + devel_msg = content[pos:].strip() + if devel_msg: + devel_msg += "\n" + + if flush: + ChangelogStaging.write_text(header) + + return devel_msg From 6ec5bb43073b615712720fbf25ab1d1f80980e48 Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 10 Apr 2024 00:07:01 +0200 Subject: [PATCH 039/140] XXX print out tag info --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index b87e1db9c..629f8dcaf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,6 +18,7 @@ # FIXME not sure if this will work on RTD tag_info = parse_git_tag() +print(tag_info, file=sys.stderr) have_changes = tag_info["n_commits"] > 0 or tag_info["dirty"] if get_next_changelog(): assert have_changes From 08f05d6ad7ce728533b56a22112605bfe246c51b Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 10 Apr 2024 00:10:16 +0200 Subject: [PATCH 040/140] XXX show git status --- docs/source/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 629f8dcaf..0653d17cf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -14,11 +14,14 @@ from pypdfium2_setup.packaging_base import ( parse_git_tag, get_next_changelog, + run_cmd, + ProjectDir, ) # FIXME not sure if this will work on RTD tag_info = parse_git_tag() print(tag_info, file=sys.stderr) +print(run_cmd(["git", "status"], cwd=ProjectDir, capture=True), 
file=sys.stderr) have_changes = tag_info["n_commits"] > 0 or tag_info["dirty"] if get_next_changelog(): assert have_changes From 953d45442651105798b9f72f4128480db632e658 Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 10 Apr 2024 00:18:23 +0200 Subject: [PATCH 041/140] continue on RTD --- docs/source/conf.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 0653d17cf..d309fa770 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,6 +5,7 @@ # See https://www.sphinx-doc.org/en/master/usage/configuration.html # and https://docs.readthedocs.io/en/stable/environment-variables.html +import os import sys import time import collections @@ -14,15 +15,13 @@ from pypdfium2_setup.packaging_base import ( parse_git_tag, get_next_changelog, - run_cmd, - ProjectDir, ) -# FIXME not sure if this will work on RTD + +# RTD modifies conf.py, so we have to ignore dirty state if on RTD +is_rtd = os.environ.get("READTHEDOCS", "").lower() == "true" tag_info = parse_git_tag() -print(tag_info, file=sys.stderr) -print(run_cmd(["git", "status"], cwd=ProjectDir, capture=True), file=sys.stderr) -have_changes = tag_info["n_commits"] > 0 or tag_info["dirty"] +have_changes = tag_info["n_commits"] > 0 or (tag_info["dirty"] and not is_rtd) if get_next_changelog(): assert have_changes From d72f49823e1e6fbdbcf7d25cd9a4cb0fd5418b4e Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 10 Apr 2024 01:37:24 +0200 Subject: [PATCH 042/140] slightly improve docs --- docs/source/conf.py | 1 - src/pypdfium2/_helpers/document.py | 5 ++--- src/pypdfium2/_helpers/page.py | 23 +++++++++++++---------- src/pypdfium2/_helpers/pageobjects.py | 2 +- src/pypdfium2/_helpers/textpage.py | 11 +++++++---- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d309fa770..1ca0449fa 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,6 @@ get_next_changelog, ) - 
# RTD modifies conf.py, so we have to ignore dirty state if on RTD is_rtd = os.environ.get("READTHEDOCS", "").lower() == "true" tag_info = parse_git_tag() diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 136abc658..776437ea1 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -144,9 +144,8 @@ def init_forms(self, config=None): Initialize a form env, if the document has forms. If already initialized, nothing will be done. See the :attr:`formenv` attribute. - Note: - If form rendering is desired, this method should be called directly after constructing the document, - before getting any page handles (due to PDFium's API). + Attention: + If form rendering is desired, this method must be called after constructing the document, before getting any page handles. Parameters: config (FPDF_FORMFILLINFO | None): diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 49e455a3c..56ac13471 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -23,9 +23,12 @@ class PdfPage (pdfium_i.AutoCloseable): Page helper class. Attributes: - raw (FPDF_PAGE): The underlying PDFium page handle. - pdf (PdfDocument): Reference to the document this page belongs to. - formenv (PdfFormEnv|None): Formenv handle, if the parent pdf had an active formenv at the time of page retrieval. None otherwise. + raw (FPDF_PAGE): + The underlying PDFium page handle. + pdf (PdfDocument): + Reference to the document this page belongs to. + formenv (PdfFormEnv | None): + Formenv handle, if the parent pdf had an active formenv at the time of page retrieval. None otherwise. """ def __init__(self, raw, pdf, formenv): @@ -101,9 +104,9 @@ def get_mediabox(self, fallback_ok=True): (float, float, float, float) | None: The page MediaBox in PDF canvas units, consisting of four coordinates (usually x0, y0, x1, y1). 
If MediaBox is not defined, returns ANSI A (0, 0, 612, 792) if ``fallback_ok=True``, None otherwise. - Note: - Due to quirks in PDFium's public API, all ``get_*box()`` functions except :meth:`.get_bbox` - do not inherit from parent nodes in the page tree (as of PDFium 5418). + + .. admonition:: Known issue\n + Due to quirks in PDFium, all ``get_*box()`` functions except :meth:`.get_bbox` do not inherit from parent nodes in the page tree (as of PDFium 5418). """ # https://crbug.com/pdfium/1786 return self._get_box(pdfium_c.FPDFPage_GetMediaBox, lambda: (0, 0, 612, 792), fallback_ok) @@ -266,7 +269,7 @@ def get_objects(self, filter=None, max_depth=2, form=None, level=0): :class:`.PdfObject`: A page object. """ - # TODO? close skipped objects explicitly ? + # TODO close skipped objects explicitly ? if form: count_objects = pdfium_c.FPDFFormObj_CountObjects @@ -326,11 +329,11 @@ def get_pos_normalizer(self, ps=5, origin="bottom_left"): """ Set up a coordinate normalizer object that may be used to apply PDF coordinate system transformations to values, or unapply them. - This may be useful when writing PDF position data to a format that assumes a strict coordinate system, or to conveniently translate visual input values to raw values (e.g. swapping crop for a page with rotated/mirrored coordinate system). + This may be useful to conveniently translate visual input values to raw values (e.g. swapping crop for a page with rotated/mirrored coordinate system), or when passing position data to a receiver that assumes a strict coordinate system. Note, as pdfium itself does not currently expose a generic coordinate normalizer, we are abusing the page <-> raster translator APIs by supplying a fictional raster of a certain scale, which is rather inelegant, as there is some back-and-forth calculation and an inherent loss of precision (though it can be made irrelevantly small), due to interjection of the raster.
- Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`PdfPosConv` directly, to avoid even more unnecessary calculation. + Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`.PdfPosConv` directly, to avoid even more unnecessary calculation. Parameters: ps (float): @@ -339,7 +342,7 @@ def get_pos_normalizer(self, ps=5, origin="bottom_left"): The corner to use as origin (``bottom_left`` or ``top_left``). The underlying pdfium API works with top left, but the default here is bottom left so that raw and normalized values align for a non-transformed coordinate system. Returns: - PdfPosNormalizer + PdfPosNormalizer: Position normalization helper. """ w, h = self.get_size() w, h = round(w*ps), round(h*ps) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index ac1e6e784..d1be171e0 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -343,7 +343,7 @@ def extract(self, dest, *args, **kwargs): This method can only extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly. Otherwise, the pixel data is decoded, and re-encoded using :mod:`PIL`. - For images with simple filters only, ``get_data(decode_simple=True)`` is used for decoding to preserve higher bit depth or special color formats not supported by ``FPDF_BITMAP``. + For images with simple filters only, ``get_data(decode_simple=True)`` is used to preserve higher bit depth or special color formats not supported by ``FPDF_BITMAP``. For images with complex filters, we have to resort to :meth:`.get_bitmap`, which can be a lossy operation. Note, this method ignores alpha masks, and potentially other data stored separately of the main data stream, which might lead to incorrect representation of the image. 
diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py index 82ab1eb63..9d9170fbe 100644 --- a/src/pypdfium2/_helpers/textpage.py +++ b/src/pypdfium2/_helpers/textpage.py @@ -20,8 +20,10 @@ class PdfTextPage (pdfium_i.AutoCloseable): Text page helper class. Attributes: - raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle. - page (PdfPage): Reference to the page this textpage belongs to. + raw (FPDF_TEXTPAGE): + The underlying PDFium textpage handle. + page (PdfPage): + Reference to the page this textpage belongs to. """ def __init__(self, raw, page): @@ -211,8 +213,9 @@ def get_charbox(self, index, loose=False): def get_rect(self, index): """ Get the bounding box of a text rectangle at the given index. - Note that :meth:`.count_rects` must be called once with default parameters - before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API). + + Attention: + :meth:`.count_rects` must be called once with default params before subsequent :meth:`.get_rect` calls for this function to work. Returns: Float values for left, bottom, right and top in PDF canvas units. From ea9b3ad33782e4562a097f6b6c3c82f64d7303bd Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 10 Apr 2024 18:06:01 +0200 Subject: [PATCH 043/140] Improve PdfBitmap.new_native() logic The buffer calculation is stride-agnostic, so it's fine to mix with a custom stride. Permit this and add assertions for the caller-provided cases. 
--- src/pypdfium2/_helpers/bitmap.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index a9343a6d9..5d93b9bdf 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -119,16 +119,25 @@ def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None): def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, stride=None): """ Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by Python/ctypes, or provided by the caller. - Buffers allocated by this function are packed (i.e. no unused bytes at line end). - If an external buffer is provided, stride may be set if there is a padding. + + If buffer and stride are None, a packed buffer is created. + If buffer is None but a custom stride is given, a stride-agnostic buffer is created. + If both custom buffer and stride are given, they are used as-is. + + Caller-provided buffers or strides are subject to a logical validation. 
""" - orig_stride = stride + bpc = pdfium_i.BitmapTypeToNChannels[format] if stride is None: - stride = width * pdfium_i.BitmapTypeToNChannels[format] + stride = width * bpc + else: + assert stride >= width * bpc + if buffer is None: - assert orig_stride is None buffer = (ctypes.c_ubyte * (stride * height))() + else: + assert len(buffer) >= stride * height + raw = pdfium_c.FPDFBitmap_CreateEx(width, height, format, buffer, stride) # alternatively, we could call the constructor directly with the information from above From 0d9e47843dcd54e7cdc8fae780a13210d1791652 Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 10 Apr 2024 23:21:21 +0200 Subject: [PATCH 044/140] Warn about pos normalizer having to be re-created --- src/pypdfium2/_helpers/page.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 56ac13471..4d20283d5 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -335,6 +335,9 @@ def get_pos_normalizer(self, ps=5, origin="bottom_left"): Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`.PdfPosConv` directly, to avoid even more unnecessary calculation. + Attention: + Whenever modifications to page geometry were made, the object has to be re-created to update the underlying fictional raster. + Parameters: ps (float): Scale factor to use for the fictional raster. Controls the precision of normalized values. 
From 8532ce63d32a4eadb55aeaa2c584609c3b87fd52 Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 10 Apr 2024 23:27:23 +0200 Subject: [PATCH 045/140] bases: style nits --- src/pypdfium2/internal/bases.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py index e1a70dcdb..51ae38fcb 100644 --- a/src/pypdfium2/internal/bases.py +++ b/src/pypdfium2/internal/bases.py @@ -26,8 +26,7 @@ class AutoCastable: @property def _as_parameter_(self): - # TODO tighten to `not isinstance(...)` (needs declaraction of C type) - if not self.raw: + if self.raw is None: raise RuntimeError("Cannot use closed object as C function parameter.") return self.raw @@ -43,7 +42,7 @@ def _close_template(close_func, raw, obj_repr, state, parent, *args, **kwargs): os.write(sys.stderr.fileno(), f"-> Cannot close object, library is destroyed. This may cause a memory leak!\n".encode()) return - assert (parent is None) or not parent._tree_closed() + assert parent is None or not parent._tree_closed() close_func(raw, *args, **kwargs) @@ -51,7 +50,7 @@ class AutoCloseable (AutoCastable): def __init__(self, close_func, *args, obj=None, needs_free=True, **kwargs): - # NOTE proactively prevent accidental double initialization + # proactively prevent accidental double initialization assert not hasattr(self, "_finalizer") self._close_func = close_func @@ -72,7 +71,7 @@ def __repr__(self): def _attach_finalizer(self): - # NOTE this function captures the value of the `parent` property at finalizer installation time - if it changes, detach the old finalizer and create a new one + # NOTE this function captures the value of the `parent` property at finalizer installation time assert self._finalizer is None self._finalizer = weakref.finalize(self._obj, _close_template, self._close_func, self.raw, repr(self), self._autoclose_state, self.parent, *self._ex_args, **self._ex_kwargs) From 30afc7969fad34ff7859ac3646881672b4d92532 Mon 
Sep 17 00:00:00 2001 From: geisserml Date: Wed, 10 Apr 2024 23:36:57 +0200 Subject: [PATCH 046/140] Remove PdfPosNormalizer experiment The fact that the object has to be re-created in response to any change in page geometry renders it basically unusable IMO. The whole approach was bad practice anyway. The correct way to do this would be to patch pdfium with new APIs that don't do the raster rounding and origin translation. FWIW, I made a private copy of the code. --- docs/devel/changelog_staging.md | 1 - src/pypdfium2/_helpers/page.py | 68 +-------------------------------- 2 files changed, 2 insertions(+), 67 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index ce0d4c8da..909d28b77 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -20,7 +20,6 @@ *Improvements and new features* - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates. -- Also added `PdfPosNormalizer` and `PdfPage.get_pos_normalizer()` as a wrapper around `PdfPosConv`. - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. - Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added an assertion to make sure requirements are met, and updated docs accordingly. - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. 
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 4d20283d5..81bcd5420 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -__all__ = ("PdfPage", "PdfPosNormalizer") +__all__ = ("PdfPage") import math import ctypes @@ -10,7 +10,7 @@ import pypdfium2.raw as pdfium_c import pypdfium2.internal as pdfium_i from pypdfium2._helpers.misc import PdfiumError -from pypdfium2._helpers.bitmap import PdfBitmap, PdfPosConv +from pypdfium2._helpers.bitmap import PdfBitmap from pypdfium2._helpers.textpage import PdfTextPage from pypdfium2._helpers.pageobjects import PdfObject @@ -325,34 +325,6 @@ def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY): return rc - def get_pos_normalizer(self, ps=5, origin="bottom_left"): - """ - Set up a coordinate normalizer object that may be used to apply PDF coordinate system transformations to values, or unapply them. - - This may be useful to conveniently translate visual input values to raw values (e.g. swapping crop for a page with rotated/mirrored coordinate system), or when passing position data to a receiver that assumes a strict coordinate system. - - Note, as pdfium itself does not currently expose a generic coordinate normalizer, we are absusing the page <-> raster translator APIs by supplying a fictional raster of a certain scale, which is rather inelegant, as there is some back-and-forth calculation and an inherent loss of precision (though it can be made irrelevantly small), due to interjection of the raster. - - Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`.PdfPosConv` directly, to avoid even more unnecessary calculation. - - Attention: - Whenever modifications to page geometry were made, the object has to be re-created to update the underlying fictional raster. 
- - Parameters: - ps (float): - Scale factor to use for the fictional raster. Controls the precision of normalized values. - origin (str): - The corner to use as origin (``bottom_left`` or ``top_left``). - The underlying pdfium API works with top left, but the default here is bottom left so that raw and normalized values align for a non-transformed coordinate system. - Returns: - PdfPosNormalizer: Position normalization helper. - """ - w, h = self.get_size() - w, h = round(w*ps), round(h*ps) - posconv = PdfPosConv(self, (0, 0, w, h, 0)) - return PdfPosNormalizer(posconv, ps, origin) - - # TODO # - add helpers for matrix-based and interruptible rendering # - add lower-level renderer that takes a caller-provided bitmap @@ -532,39 +504,3 @@ def _parse_renderopts( # TODO consider using a namedtuple or something return cl_format, rev_byteorder, fill_color, flags - - -class PdfPosNormalizer: - """ - Pdf coordinate normalizer. - See :meth:`.PdfPage.get_pos_normalizer` for description. - """ - - def __init__(self, posconv, ps, origin): - self._posconv = posconv - self._ps = ps - if origin == "top_left": - self._translate_y = lambda y: y - elif origin == "bottom_left": - size_y = posconv.pos_args[3] - self._translate_y = lambda y: size_y - y - else: - raise ValueError(f"Origin {origin!r} is not a supported corner.") - - def to_norm(self, raw_x, raw_y): - """ - Translate raw to normalized coordinates. This applies coordinate system transformations. - """ - x, y = self._posconv.to_bitmap(raw_x, raw_y) - x = x / self._ps - y = self._translate_y(y) / self._ps - return x, y - - def to_raw(self, norm_x, norm_y): - """ - Translate normalized to raw coordinates. - This unapplies coordinate system transformations by doing the inverse transformation. 
- """ - x = round(norm_x * self._ps) - y = round(self._translate_y(norm_y * self._ps)) - return self._posconv.to_page(x, y) From 1fcbfd81667df834522ede507e8099c9b1932aaa Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Apr 2024 19:06:20 +0200 Subject: [PATCH 047/140] fix `__all__` blunder ("...") reduces to "...", which is also iterable (but quite wrong), leading to `AttributeError: module 'pypdfium2._helpers.page' has no attribute 'P'` Perhaps it would be smarter to use lists rather than tuples anyway. --- src/pypdfium2/_helpers/page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 81bcd5420..4f3906fef 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -__all__ = ("PdfPage") +__all__ = ("PdfPage", ) import math import ctypes From 4fe9d0d9c2e4e1a6809563e0f0415f8fd554aafb Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Apr 2024 19:12:10 +0200 Subject: [PATCH 048/140] get_count(): fix doc blunder It is supposed to be the number of direct children only (which, incidentally, also makes more sense). Not sure how I got it into my mind this would be recursive. --- src/pypdfium2/_helpers/document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 776437ea1..87ae513da 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -646,7 +646,7 @@ def get_title(self): def get_count(self): """ Returns: - int: Signed number of child bookmarks, recursively counting all members in the subtree. Zero if the bookmark has no descendants. + int: Signed number of direct child bookmarks (i.e. non-recursive). Zero if the bookmark has no descendants. 
The initial state shall be closed (collapsed) if negative, open (expanded) if positive. """ return pdfium_c.FPDFBookmark_GetCount(self) From ccc0b1804e5505e3a807b30406ef4c5f8021dea1 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Apr 2024 19:15:46 +0200 Subject: [PATCH 049/140] move up helper function --- src/pypdfium2/_helpers/document.py | 52 +++++++++++++++--------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 87ae513da..27d94fbee 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -529,6 +529,32 @@ def get_toc( bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr) +def _open_pdf(input_data, password, autoclose): + + to_hold, to_close = (), () + if password is not None: + password = (password+"\x00").encode("utf-8") + + if isinstance(input_data, Path): + pdf = pdfium_c.FPDF_LoadDocument((str(input_data)+"\x00").encode("utf-8"), password) + elif isinstance(input_data, (bytes, ctypes.Array)): + pdf = pdfium_c.FPDF_LoadMemDocument64(input_data, len(input_data), password) + to_hold = (input_data, ) + elif pdfium_i.is_buffer(input_data, "r"): + bufaccess, to_hold = pdfium_i.get_bufreader(input_data) + if autoclose: + to_close = (input_data, ) + pdf = pdfium_c.FPDF_LoadCustomDocument(bufaccess, password) + else: + raise TypeError(f"Invalid input type '{type(input_data).__name__}'") + + if pdfium_c.FPDF_GetPageCount(pdf) < 1: + err_code = pdfium_c.FPDF_GetLastError() + raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).") + + return pdf, to_hold, to_close + + class PdfFormEnv (pdfium_i.AutoCloseable): """ Form environment helper class. 
@@ -589,32 +615,6 @@ def as_pageobject(self): return PdfObject(raw=raw_pageobj, pdf=self.pdf) -def _open_pdf(input_data, password, autoclose): - - to_hold, to_close = (), () - if password is not None: - password = (password+"\x00").encode("utf-8") - - if isinstance(input_data, Path): - pdf = pdfium_c.FPDF_LoadDocument((str(input_data)+"\x00").encode("utf-8"), password) - elif isinstance(input_data, (bytes, ctypes.Array)): - pdf = pdfium_c.FPDF_LoadMemDocument64(input_data, len(input_data), password) - to_hold = (input_data, ) - elif pdfium_i.is_buffer(input_data, "r"): - bufaccess, to_hold = pdfium_i.get_bufreader(input_data) - if autoclose: - to_close = (input_data, ) - pdf = pdfium_c.FPDF_LoadCustomDocument(bufaccess, password) - else: - raise TypeError(f"Invalid input type '{type(input_data).__name__}'") - - if pdfium_c.FPDF_GetPageCount(pdf) < 1: - err_code = pdfium_c.FPDF_GetLastError() - raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).") - - return pdf, to_hold, to_close - - class PdfBookmark (pdfium_i.AutoCastable): """ Bookmark helper class. From bf3e6164eb1b113561ca9b062c3f3b85372714ec Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Apr 2024 19:17:33 +0200 Subject: [PATCH 050/140] update changelog --- docs/devel/changelog_staging.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 909d28b77..c5bb8a9bd 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -28,6 +28,7 @@ *Project* - Merged `tests_old/` back into `tests/`. +- Docs: Improved logic when to include the unreleased version warning and upcoming changelog. - - - - -# Planned Changes - -To find out about possible planned changes, you can ... -* Search the codebase for `TODO(apibreak)`. -* Check if there is a development branch. If so, take a look at its changelog (`docs/devel/changelog_staging.md`). 
From d577349cf99f48cf4da11a9711444654e20dbd16 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 5 May 2024 19:22:27 +0200 Subject: [PATCH 060/140] minor readme improvements --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index caeeecfc2..6a953820e 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct PDFIUM_PLATFORM="sourcebuild" python -m pip install -v . ``` Building PDFium may take a long time, as it comes with its bundled toolchain and deps, rather than taking them from the system.[^pdfium_buildsystem] - However, we can at least provide the `--use-syslibs` option to build against system-provided runtime libraries. + However, we can at least provide the `--use-syslibs` option to build against system runtime libraries. * With system-provided binary 🔗 ```bash @@ -98,14 +98,14 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct See [Setup Magic](#setup-magic) for details. - Support for source installs (esp. with self-built/system pdfium) is limited, as their integrity depends somewhat on a correctly acting caller. + Support for source installs (esp. with self-built/system pdfium) is limited, as their integrity somewhat depends on a correctly acting caller. - Installing an `sdist` does not implicitly trigger a sourcebuild if no pre-built binary is available. It is preferred to let callers decide consciously what to do, and run the build script without pip encapsulation. + Installing an `sdist` does not implicitly trigger a sourcebuild if no pre-built binary is available. We prefer to let callers decide consciously what to do, and run the build script without pip encapsulation. Relevant pip options: * `-v`: Verbose logging output. Useful for debugging. * `-e`: Install in editable mode, so the installation points to the source tree. 
This way, changes directly take effect without needing to re-install. Recommended for development. - * `--no-build-isolation`: Do not isolate setup in a virtual env; use the main env instead. This renders `pyproject.toml [build-system]` inactive, setup deps must be prepared by caller. Useful to install custom versions of setup deps, or as speedup when installing repeatedly. + * `--no-build-isolation`: Do not isolate setup in a virtual env; use the main env instead. This renders `pyproject.toml [build-system]` inactive, so setup deps must be prepared by caller. Useful to install custom versions of setup deps, or as speedup when installing repeatedly. [^pdfium_buildsystem]: This means pdfium may not compile on arbitrary hosts. The script is limited to build hosts supported by Google's toolchain. Ideally, we'd need an alternative build system that runs with system packages instead. @@ -129,7 +129,8 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct conda install pypdfium2-team::pypdfium2_helpers --override-channels -c pypdfium2-team -c bblanchon ``` - Adding the channels permanently and tightening priority is encouraged to include pypdfium2 in `conda update` by default, and to avoid accidentally replacing the install with a different channel. (If desired, you may limit the channel config to the current environment by adding `--env`.) + If desired, you may limit the channel config to the current environment by adding `--env`. + Adding the channels permanently and tightening priority is encouraged to include pypdfium2 in `conda update` by default, and to avoid accidentally replacing the install with a different channel. Otherwise, you should be cautious when making changes to the environment. 
+ To depend on pypdfium2 in a `conda-build` recipe From 640e80a9efe287f857e4ef9964aa2cc3d0218d47 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 5 May 2024 23:38:50 +0200 Subject: [PATCH 061/140] CLI/arrange: rm pointless var, better release implicit fh --- src/pypdfium2/_cli/arrange.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/pypdfium2/_cli/arrange.py b/src/pypdfium2/_cli/arrange.py index ec7108447..5e2b050b3 100644 --- a/src/pypdfium2/_cli/arrange.py +++ b/src/pypdfium2/_cli/arrange.py @@ -41,11 +41,9 @@ def main(args): args.passwords.append(None) dest_pdf = pdfium.PdfDocument.new() - index = 0 for in_path, pages, password in zip(args.inputs, args.pages, args.passwords): - src_pdf = pdfium.PdfDocument(in_path, password=password) - dest_pdf.import_pages(src_pdf, pages=pages) - index += len(src_pdf) + with pdfium.PdfDocument(in_path, password=password) as src_pdf: + dest_pdf.import_pages(src_pdf, pages=pages) dest_pdf.save(args.output) From 8437cfd22ab2b9c807fba252ef7bbd0579770060 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 5 May 2024 23:46:10 +0200 Subject: [PATCH 062/140] CLI: clean up some comments --- src/pypdfium2/_cli/arrange.py | 1 - src/pypdfium2/_cli/attachments.py | 1 - src/pypdfium2/_cli/extract_images.py | 1 - src/pypdfium2/_cli/extract_text.py | 1 - src/pypdfium2/_cli/pdfinfo.py | 1 - src/pypdfium2/_cli/render.py | 6 +----- src/pypdfium2/_cli/tile.py | 1 - src/pypdfium2/_cli/toc.py | 1 - 8 files changed, 1 insertion(+), 12 deletions(-) diff --git a/src/pypdfium2/_cli/arrange.py b/src/pypdfium2/_cli/arrange.py index 5e2b050b3..9261a3006 100644 --- a/src/pypdfium2/_cli/arrange.py +++ b/src/pypdfium2/_cli/arrange.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause import pypdfium2._helpers as pdfium -# TODO? 
consider dotted access from pypdfium2._cli._parsers import parse_numtext diff --git a/src/pypdfium2/_cli/attachments.py b/src/pypdfium2/_cli/attachments.py index 039c58cd4..1536aa615 100644 --- a/src/pypdfium2/_cli/attachments.py +++ b/src/pypdfium2/_cli/attachments.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause from pathlib import Path -# TODO? consider dotted access from pypdfium2._cli._parsers import ( add_input, get_input, parse_numtext, diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py index 6091d3489..87e5aeaef 100644 --- a/src/pypdfium2/_cli/extract_images.py +++ b/src/pypdfium2/_cli/extract_images.py @@ -7,7 +7,6 @@ from pathlib import Path import pypdfium2.raw as pdfium_c import pypdfium2._helpers as pdfium -# TODO? consider dotted access from pypdfium2._cli._parsers import add_input, get_input diff --git a/src/pypdfium2/_cli/extract_text.py b/src/pypdfium2/_cli/extract_text.py index 738645660..360e69897 100644 --- a/src/pypdfium2/_cli/extract_text.py +++ b/src/pypdfium2/_cli/extract_text.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -# TODO? consider dotted access from pypdfium2._cli._parsers import add_input, get_input EXTRACT_RANGE = "range" diff --git a/src/pypdfium2/_cli/pdfinfo.py b/src/pypdfium2/_cli/pdfinfo.py index f4daffc03..f8dbd0011 100644 --- a/src/pypdfium2/_cli/pdfinfo.py +++ b/src/pypdfium2/_cli/pdfinfo.py @@ -3,7 +3,6 @@ import pypdfium2.raw as pdfium_c import pypdfium2.internal as pdfium_i -# TODO? consider dotted access from pypdfium2._cli._parsers import ( add_input, add_n_digits, diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 7d3977f64..2236d346a 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -17,7 +17,6 @@ import pypdfium2._helpers as pdfium import pypdfium2.internal as pdfium_i import pypdfium2.raw as pdfium_r -# TODO? 
consider dotted access from pypdfium2._cli._parsers import ( add_input, get_input, setup_logging, @@ -240,13 +239,11 @@ def _render_parallel_job(i): global ProcObjs; _render_job(i, *ProcObjs) +# TODO turn into a python-usable API yielding output paths as they are written def main(args): - # TODO turn into a python-usable API yielding output paths as they are written - pdf = get_input(args, init_forms=args.draw_forms) - # TODO move to parsers? pdf_len = len(pdf) if not all(0 <= i < pdf_len for i in args.pages): raise ValueError("Out-of-bounds page indices are prohibited.") @@ -309,7 +306,6 @@ def main(args): logger.info("Parallel rendering ...") ctx = mp.get_context(args.parallel_strategy) - # TODO unify using mp.pool.Pool(context=...) ? pool_backends = dict( mp = (ctx.Pool, "imap"), ft = (functools.partial(ft.ProcessPoolExecutor, mp_context=ctx), "map"), diff --git a/src/pypdfium2/_cli/tile.py b/src/pypdfium2/_cli/tile.py index 82d2c1e67..975648160 100644 --- a/src/pypdfium2/_cli/tile.py +++ b/src/pypdfium2/_cli/tile.py @@ -5,7 +5,6 @@ from pathlib import Path import pypdfium2.raw as pdfium_c import pypdfium2._helpers as pdfium -# TODO? consider dotted access from pypdfium2._cli._parsers import add_input, get_input diff --git a/src/pypdfium2/_cli/toc.py b/src/pypdfium2/_cli/toc.py index 5425a33ea..79511cc85 100644 --- a/src/pypdfium2/_cli/toc.py +++ b/src/pypdfium2/_cli/toc.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause import pypdfium2.internal as pdfium_i -# TODO? 
consider dotted access from pypdfium2._cli._parsers import ( add_input, add_n_digits, From 87a65479a3ae3d9cc4fe7c3e62152a32d28483b3 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 6 May 2024 16:06:01 +0200 Subject: [PATCH 063/140] Prepare for future release --- .github/workflows/trigger_conda_raw.yaml | 7 +++---- .github/workflows/trigger_main.yaml | 11 +++++------ autorelease/config.json | 6 +++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/trigger_conda_raw.yaml b/.github/workflows/trigger_conda_raw.yaml index e1484294b..b8e192ba9 100644 --- a/.github/workflows/trigger_conda_raw.yaml +++ b/.github/workflows/trigger_conda_raw.yaml @@ -3,10 +3,9 @@ name: Trigger conda_raw release on: - # NOTE temporarily commented out, awaiting merge of the v5 branch - # schedule: - # # pdfium-binaries triggers conda on the first Monday of month at 4 o'clock UTC, so we'll want to rebuild after that, but before the next main release where we want to use the package - # - cron: '0 4 8 * *' # monthly, 8th day + schedule: + # pdfium-binaries triggers conda on the first Monday of month at 4 o'clock UTC, so we'll want to rebuild after that, but before the next main release where we want to use the package + - cron: '0 4 8 * *' # monthly, 8th day workflow_dispatch: jobs: diff --git a/.github/workflows/trigger_main.yaml b/.github/workflows/trigger_main.yaml index 01c1b755e..63cf7f499 100644 --- a/.github/workflows/trigger_main.yaml +++ b/.github/workflows/trigger_main.yaml @@ -5,12 +5,11 @@ name: Trigger main release on: - # NOTE temporarily commented out, awaiting merge of the v5 branch - # # https://github.com/bblanchon/pdfium-binaries/blob/master/.github/workflows/trigger.yml - # # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule - # # https://crontab.guru/ - # schedule: - # - cron: '0 4 10 * *' # monthly, 10th day + # https://github.com/bblanchon/pdfium-binaries/blob/master/.github/workflows/trigger.yml 
+ # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule + # https://crontab.guru/ + schedule: + - cron: '0 4 10 * *' # monthly, 10th day workflow_dispatch: jobs: diff --git a/autorelease/config.json b/autorelease/config.json index 7f0e24f9f..7e2da1c10 100644 --- a/autorelease/config.json +++ b/autorelease/config.json @@ -1,4 +1,4 @@ { - "beta": false, - "major": false -} \ No newline at end of file + "beta": true, + "major": true +} From 1886f89b0ce6c8bbd465abd92ea14cedbed244f0 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 6 May 2024 16:30:22 +0200 Subject: [PATCH 064/140] retain get_text_range() check for now also move up get_text_bounded for docs --- docs/devel/changelog_staging.md | 4 +- src/pypdfium2/_helpers/textpage.py | 71 +++++++++++++++--------------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 2c2600df2..88aff4c62 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -5,7 +5,7 @@ # Changelog for next release -*API-breaking changes* +*API changes* - Rendering / Bitmap * Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). Instead, use `PdfPage.render()` with a loop or process pool. * Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`. @@ -16,6 +16,7 @@ * Renamed `PdfImage.get_size()` to `.get_px_size()`. * `PdfImage.extract()`: Removed `fb_render` param because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place. - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. 
Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest. +- `get_text_range()`: Restored pre-v4.28 behavior, as pdfium reverted `FPDFText_GetText()` to UCS-2. Removed implicit translation of default calls to `get_text_bounded()`. However, the latter should be preferred due to full Unicode support. - Removed legacy version flags. *Improvements and new features* @@ -24,7 +25,6 @@ - Exposed `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added check and updated docs accordingly. - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype. -- Restored `get_text_range()` to its pre-v4.28 behavior, as pdfium reverted `FPDFText_GetText()` to UCS-2. - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`. - Simplified version impl (no API change expected). diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py index 6b67f0c3e..d7eb53779 100644 --- a/src/pypdfium2/_helpers/textpage.py +++ b/src/pypdfium2/_helpers/textpage.py @@ -36,6 +36,38 @@ def parent(self): # AutoCloseable hook return self.page + def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"): + """ + Extract text from given boundaries in PDF coordinates. + If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`. + + Parameters: + errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`). + Returns: + str: The text on the page area in question, or an empty string if no text was found. 
+ """ + + bbox = self.page.get_bbox() + if left is None: + left = bbox[0] + if bottom is None: + bottom = bbox[1] + if right is None: + right = bbox[2] + if top is None: + top = bbox[3] + + args = (self, left, top, right, bottom) + n_chars = pdfium_c.FPDFText_GetBoundedText(*args, None, 0) + if n_chars <= 0: + return "" + + buffer = ctypes.create_string_buffer(n_chars * 2) + buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort)) + pdfium_c.FPDFText_GetBoundedText(*args, buffer_ptr, n_chars) + return buffer.raw.decode("utf-16-le", errors=errors) + + def _get_active_text_range(self, c_start, c_end, l_passive=0, r_passive=0): if c_start > c_end: @@ -56,6 +88,9 @@ def get_text_range(self, index=0, count=-1, errors="ignore"): """ Extract text from a given range. + Warning: + This method is limited to UCS-2, whereas :meth:`.get_text_bounded` provides full Unicode support. + Parameters: index (int): Index of the first char to include. count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*. @@ -86,11 +121,9 @@ def get_text_range(self, index=0, count=-1, errors="ignore"): count -= l_passive + r_passive in_count = t_end+1 - t_start - # pdfium fea01fa9e2 to d6a4b27d80 requires assuming 4 bytes per character - # this corresponds to approx. 
>6167,<6415 or pdfium-binaries >=6191,<=6406 + # pdfium builds from fea01fa9e2 (>6167) to d6a4b27d80 (<6415) require assuming 4 bytes per character # https://github.com/pypdfium2-team/pypdfium2/issues/298 # https://crbug.com/pdfium/2133 - # -> NOTE(geisserml) may be removed once pdfium-binaries > d6a4b27d80 is released if 6167 < PDFIUM_INFO.build < 6415: in_count *= 2 in_count += 1 # null terminator @@ -103,38 +136,6 @@ def get_text_range(self, index=0, count=-1, errors="ignore"): return buffer.raw[:(out_count-1)*2].decode("utf-16-le", errors=errors) - def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"): - """ - Extract text from given boundaries in PDF coordinates. - If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`. - - Parameters: - errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`). - Returns: - str: The text on the page area in question, or an empty string if no text was found. 
- """ - - bbox = self.page.get_bbox() - if left is None: - left = bbox[0] - if bottom is None: - bottom = bbox[1] - if right is None: - right = bbox[2] - if top is None: - top = bbox[3] - - args = (self, left, top, right, bottom) - n_chars = pdfium_c.FPDFText_GetBoundedText(*args, None, 0) - if n_chars <= 0: - return "" - - buffer = ctypes.create_string_buffer(n_chars * 2) - buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort)) - pdfium_c.FPDFText_GetBoundedText(*args, buffer_ptr, n_chars) - return buffer.raw.decode("utf-16-le", errors=errors) - - def count_chars(self): """ Returns: From 027b909f7792d61534bfb28b0e4365921af10376 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 6 May 2024 16:36:32 +0200 Subject: [PATCH 065/140] round off docs for `PdfBitmap.new_native()` --- src/pypdfium2/_helpers/bitmap.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index 43f297dde..df67c0599 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -120,11 +120,12 @@ def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, str """ Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by Python/ctypes, or provided by the caller. - If buffer and stride are None, a packed buffer is created. - If buffer is None but a custom stride is given, a stride-agnostic buffer is created. - If both custom buffer and stride are given, they are used as-is. + * If buffer and stride are None, a packed buffer is created. + * If a custom buffer is given but no stride, the buffer is assumed to be packed. + * If a custom stride is given but no buffer, a stride-agnostic buffer is created. + * If both custom buffer and stride are given, they are used as-is. - Caller-provided buffers or strides are subject to a logical validation. + Caller-provided buffer/stride are subject to a logical validation. 
""" bpc = pdfium_i.BitmapTypeToNChannels[format] From 2f135e6cd5deb81f73e92806ecefa86cc3007eec Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 7 May 2024 00:20:53 +0200 Subject: [PATCH 066/140] PdfImage.extract(): fix for filenames containing non-extension dot --- src/pypdfium2/_helpers/pageobjects.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index d1be171e0..d7c6162f2 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -360,10 +360,10 @@ def extract(self, dest, *args, **kwargs): extraction_gen = _extract_smart(self, *args, **kwargs) format = next(extraction_gen) - if isinstance(dest, str): - dest = Path(dest) if isinstance(dest, Path): - with open(dest.with_suffix("."+format), "wb") as buf: + dest = str(dest) + if isinstance(dest, str): + with open(f"{dest}.{format}", "wb") as buf: extraction_gen.send(buf) elif pdfium_i.is_buffer(dest, "w"): extraction_gen.send(dest) From cdc0c06926f8361285be75029151f4d3fea95745 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 7 May 2024 00:22:05 +0200 Subject: [PATCH 067/140] CLI/extract-images: increase default recursion depth Increase XObject recursion depth to 15 to be on the safe side of capturing all images. 
--- src/pypdfium2/_cli/extract_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py index 87e5aeaef..3a21b6df4 100644 --- a/src/pypdfium2/_cli/extract_images.py +++ b/src/pypdfium2/_cli/extract_images.py @@ -21,7 +21,7 @@ def attach(parser): parser.add_argument( "--max-depth", type = int, - default = 2, + default = 15, help = "Maximum recursion depth to consider when looking for page objects.", ) parser.add_argument( From 0f0dfb190f813b5837e2f2ba360d1858a8a47e4e Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 7 May 2024 00:28:59 +0200 Subject: [PATCH 068/140] update changelog --- docs/devel/changelog_staging.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 88aff4c62..e8ab03667 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -19,6 +19,10 @@ - `get_text_range()`: Restored pre-v4.28 behavior, as pdfium reverted `FPDFText_GetText()` to UCS-2. Removed implicit translation of default calls to `get_text_bounded()`. However, the latter should be preferred due to full Unicode support. - Removed legacy version flags. +*Bug fixes* +- Fixed blunder in `PdfImage.extract()` producing an incorrect output path for prefixes containing a dot. In the `extract-images` CLI, this caused all output images of a type to be written to the same path for a document containing a non-extension dot in the filename. +- XFA / rendering CLI: Fixed incorrect recognition of document length. `pdf.init_forms()` must be called before `len(pdf)`. + *Improvements and new features* - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates. - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. 
@@ -28,9 +32,6 @@ - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`. - Simplified version impl (no API change expected). -*Bug fixes* -- XFA / rendering CLI: Fixed incorrect recognition of document length. `pdf.init_forms()` must be called before `len(pdf)`. - *Project* - Merged `tests_old/` back into `tests/`. - Docs: Improved logic when to include the unreleased version warning and upcoming changelog. From 37bde6447121f853653bd94e6fea9e64325fe3f6 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 7 May 2024 00:20:53 +0200 Subject: [PATCH 069/140] PdfImage.extract(): fix for filenames containing non-extension dot Cherry-picked from devel_new --- src/pypdfium2/_helpers/pageobjects.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 2be708f1a..7941e6e84 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -334,10 +334,10 @@ def extract(self, dest, *args, **kwargs): extraction_gen = _extract_smart(self, *args, **kwargs) format = next(extraction_gen) - if isinstance(dest, str): - dest = Path(dest) if isinstance(dest, Path): - with open(dest.with_suffix("."+format), "wb") as buf: + dest = str(dest) + if isinstance(dest, str): + with open(f"{dest}.{format}", "wb") as buf: extraction_gen.send(buf) elif pdfium_i.is_buffer(dest, "w"): extraction_gen.send(dest) From 863d85dd730612e34bbfeccd7212b0dc3ee30a03 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 7 May 2024 00:37:47 +0200 Subject: [PATCH 070/140] get_text_range(): adapt allocation to pdfium version backported from devel_new branch --- src/pypdfium2/_helpers/textpage.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py index b1303fecd..9ba875f73 100644 --- a/src/pypdfium2/_helpers/textpage.py +++ 
b/src/pypdfium2/_helpers/textpage.py @@ -9,6 +9,7 @@ import pypdfium2.raw as pdfium_c import pypdfium2.internal as pdfium_i from pypdfium2._helpers.misc import PdfiumError +from pypdfium2.version import PDFIUM_INFO c_double = ctypes.c_double @@ -94,7 +95,14 @@ def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False): t_start, t_end, l_passive, r_passive = active_range index += l_passive count -= l_passive + r_passive - in_count = (t_end+1 - t_start)*2 + 1 + in_count = t_end+1 - t_start + + # pdfium fea01fa9e2 (>6167) to d6a4b27d80 (<6415) requires assuming 4 bytes per character + # https://github.com/pypdfium2-team/pypdfium2/issues/298 + # https://crbug.com/pdfium/2133 + if 6167 < PDFIUM_INFO.build < 6415: + in_count *= 2 + in_count += 1 # null terminator buffer = ctypes.create_string_buffer(in_count * 2) buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort)) From 555ba5e3299265526884993941ddbd5cb7f22957 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 7 May 2024 16:08:01 +0200 Subject: [PATCH 071/140] PdfImage.extract(): slightly simplify path handling str and path can be embedded in an f-string equally, so do it in one clause --- src/pypdfium2/_helpers/pageobjects.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index d7c6162f2..e89b7ca17 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -360,9 +360,7 @@ def extract(self, dest, *args, **kwargs): extraction_gen = _extract_smart(self, *args, **kwargs) format = next(extraction_gen) - if isinstance(dest, Path): - dest = str(dest) - if isinstance(dest, str): + if isinstance(dest, (str, Path)): with open(f"{dest}.{format}", "wb") as buf: extraction_gen.send(buf) elif pdfium_i.is_buffer(dest, "w"): From c9115bd0ef378a932b154a53c047196da99eed0e Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 7 May 2024 16:24:13 +0200 Subject: [PATCH 
072/140] slightly simplify get_filters(skip_simple=True) expunge simple filters afterwards to avoid re-checking skip_simple --- src/pypdfium2/_helpers/pageobjects.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index e89b7ca17..ae51a6204 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -330,10 +330,11 @@ def get_filters(self, skip_simple=False): buffer = ctypes.create_string_buffer(length) pdfium_c.FPDFImageObj_GetImageFilter(self, i, buffer, length) f = buffer.value.decode("utf-8") - if skip_simple and f in self.SIMPLE_FILTERS: - continue filters.append(f) + if skip_simple: + filters = [f for f in filters if f not in self.SIMPLE_FILTERS] + return filters From 247873feefe1e5a4c000f93a3211a03581463a53 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 9 May 2024 19:13:58 +0200 Subject: [PATCH 073/140] Update changelog according to backports --- docs/devel/changelog_staging.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index e8ab03667..090507e1f 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -16,13 +16,9 @@ * Renamed `PdfImage.get_size()` to `.get_px_size()`. * `PdfImage.extract()`: Removed `fb_render` param because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place. - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest. 
-- `get_text_range()`: Restored pre-v4.28 behavior, as pdfium reverted `FPDFText_GetText()` to UCS-2. Removed implicit translation of default calls to `get_text_bounded()`. However, the latter should be preferred due to full Unicode support. +- `get_text_range()`: Removed implicit translation of default calls to `get_text_bounded()`, as pdfium reverted `FPDFText_GetText()` to UCS-2, which resolves the allocation concern. However, callers are encouraged to explicitly use `get_text_bounded()` for full Unicode support. - Removed legacy version flags. -*Bug fixes* -- Fixed blunder in `PdfImage.extract()` producing an incorrect output path for prefixes containing a dot. In the `extract-images` CLI, this caused all output images of a type to be written to the same path for a document containing a non-extension dot in the filename. -- XFA / rendering CLI: Fixed incorrect recognition of document length. `pdf.init_forms()` must be called before `len(pdf)`. - *Improvements and new features* - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates. - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. 
From 72b60ed9dbdf278e6653426d632cae35ecefb6c0 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 10 May 2024 21:00:04 +0200 Subject: [PATCH 074/140] consts: clean up comment version classes are no longer deferred because it caused too much bloat --- src/pypdfium2/internal/consts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pypdfium2/internal/consts.py b/src/pypdfium2/internal/consts.py index 28c050c36..0b95d123c 100644 --- a/src/pypdfium2/internal/consts.py +++ b/src/pypdfium2/internal/consts.py @@ -126,9 +126,8 @@ def get(self, key, default_prefix="Unhandled constant"): }) -# known implication: causes eager evaluation of pdfium version if "XFA" in PDFIUM_INFO.flags: - #: [V8/XFA builds only] Convert a PDFium XFA error constant (:attr:`FPDF_ERR_XFA*`) to string. + #: [XFA builds only] Convert a PDFium XFA error constant (:attr:`FPDF_ERR_XFA*`) to string. XFAErrorToStr = _fallback_dict({ pdfium_c.FPDF_ERR_XFALOAD: "Load error", pdfium_c.FPDF_ERR_XFALAYOUT: "Layout error", From 1eab5cbbab9f3ca0a4399103ad6c91b2197ed0dc Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 13 May 2024 18:16:57 +0200 Subject: [PATCH 075/140] PdfPage.get_objects(): increase default recursion depth to align with the extract-images CLI --- src/pypdfium2/_helpers/page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index d34322be0..55acb1e34 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -253,7 +253,7 @@ def gen_content(self): raise PdfiumError("Failed to generate page content.") - def get_objects(self, filter=None, max_depth=2, form=None, level=0): + def get_objects(self, filter=None, max_depth=15, form=None, level=0): """ Iterate through the page objects on this page. 
From ca9c964fb18fff38884d3a4e79e88e9ea2c36b3f Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 13 May 2024 18:15:42 +0200 Subject: [PATCH 076/140] slightly update docs for PdfImage.extract() again --- src/pypdfium2/_helpers/pageobjects.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index ae51a6204..45926481f 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -343,15 +343,18 @@ def extract(self, dest, *args, **kwargs): Extract the image into an independently usable file or byte buffer, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits. This method can only extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly. - Otherwise, the pixel data is decoded, and re-encoded using :mod:`PIL`. + Otherwise, the pixel data is decoded and re-encoded using :mod:`PIL`, which is slower and loses the original encoding. For images with simple filters only, ``get_data(decode_simple=True)`` is used to preserve higher bit depth or special color formats not supported by ``FPDF_BITMAP``. - For images with complex filters, we have to resort to :meth:`.get_bitmap`, which can be a lossy operation. + For images with complex filters other than those extracted directly, we have to resort to :meth:`.get_bitmap`. - Note, this method ignores alpha masks, and potentially other data stored separately of the main data stream, which might lead to incorrect representation of the image. + Note, this method is not able to account for alpha masks, and potentially other data stored separately of the main image stream, which might lead to incorrect representation of the image. + + Tip: + The ``pikepdf`` library is capable of preserving the original encoding in many cases where this method is not. 
Parameters: dest (str | pathlib.Path | io.BytesIO): - File prefix or byte buffer to which the image shall be written. + File path prefix or byte buffer to which the image shall be written. fb_format (str): The image format to use in case it is necessary to (re-)encode the data. """ From f75e075c7a39fa29d3edbc00dd46dc7c1e27b374 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 13 May 2024 19:31:00 +0200 Subject: [PATCH 077/140] Add warning about textpage handles when removing text objects See https://pdfium-review.googlesource.com/c/pdfium/+/118914 --- src/pypdfium2/_helpers/page.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 55acb1e34..aa6065d14 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -228,6 +228,9 @@ def remove_obj(self, pageobj): As of PDFium 5692, detached page objects may be only re-inserted into existing pages of the same document. If the page object is not re-inserted into a page, its ``close()`` method may be called. + Caution: + If the object's :attr:`~.PdfObject.type` is :data:`FPDF_PAGEOBJ_TEXT`, all :class:`.PdfTextPage` handles ought to be closed before removing the object. + Parameters: pageobj (PdfObject): The page object to remove. """ From bc8e18c59efe517e9e946c6c3559025291b0af7e Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 13 May 2024 19:32:17 +0200 Subject: [PATCH 078/140] Explain PdfObject.close() --- src/pypdfium2/_helpers/pageobjects.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 45926481f..2316ae0e6 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -25,6 +25,10 @@ class PdfObject (pdfium_i.AutoCloseable): When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, depending on the object's :attr:`.type` (e. g. 
:class:`.PdfImage`). + Note: + :meth:`.PdfObject.close` only takes effect on loose pageobjects. + It is a no-op otherwise, because pageobjects that are part of a page are owned by pdfium, not the caller. + Attributes: raw (FPDF_PAGEOBJECT): The underlying PDFium pageobject handle. From 9a0221462378af4416ddba21c0cac85feeb5dba4 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 13 May 2024 20:03:45 +0200 Subject: [PATCH 079/140] Autoclose textpage handles when removing text pageobject --- src/pypdfium2/_helpers/page.py | 14 ++++++++++++-- tests/test_textpage.py | 23 +++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index aa6065d14..0e9b24568 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -35,6 +35,7 @@ def __init__(self, raw, pdf, formenv): self.raw = raw self.pdf = pdf self.formenv = formenv + self._textpage_wrefs = [] super().__init__(PdfPage._close_impl, self.formenv) @@ -195,6 +196,7 @@ def get_textpage(self): raise PdfiumError("Failed to load text page.") textpage = PdfTextPage(raw_textpage, self) self._add_kid(textpage) + self._textpage_wrefs.append( weakref.ref(textpage) ) return textpage @@ -228,8 +230,8 @@ def remove_obj(self, pageobj): As of PDFium 5692, detached page objects may be only re-inserted into existing pages of the same document. If the page object is not re-inserted into a page, its ``close()`` method may be called. - Caution: - If the object's :attr:`~.PdfObject.type` is :data:`FPDF_PAGEOBJ_TEXT`, all :class:`.PdfTextPage` handles ought to be closed before removing the object. + Note: + If the object's :attr:`~.PdfObject.type` is :data:`FPDF_PAGEOBJ_TEXT`, any :class:`.PdfTextPage` handles to the page should be closed before removing the object. Parameters: pageobj (PdfObject): The page object to remove. 
@@ -238,6 +240,14 @@ def remove_obj(self, pageobj): if pageobj.page is not self: raise ValueError("The page object you attempted to remove is not part of this page.") + # https://pdfium-review.googlesource.com/c/pdfium/+/118914 + if pageobj.type == pdfium_c.FPDF_PAGEOBJ_TEXT: + for wref in self._textpage_wrefs: + textpage = wref() + if textpage and textpage.raw: + logger.warning(f"When removing a text pageobject, any textpage handles ought to be closed beforehand - auto-closing {textpage}.") + textpage.close() + ok = pdfium_c.FPDFPage_RemoveObject(self, pageobj) if not ok: raise PdfiumError("Failed to remove pageobject.") diff --git a/tests/test_textpage.py b/tests/test_textpage.py index eeacc9e33..be21a5386 100644 --- a/tests/test_textpage.py +++ b/tests/test_textpage.py @@ -3,7 +3,9 @@ import re import pytest +import logging import pypdfium2 as pdfium +import pypdfium2.raw as pdfium_c from .conftest import TestFiles @@ -152,3 +154,24 @@ def test_get_text_bounded_defaults_with_rotation(): text = textpage.get_text_bounded() assert len(text) == 438 + + +@pytest.mark.parametrize("explicit_close", [False, True]) +def test_autoclose_with_remove_obj(caplog, explicit_close): + + pdf = pdfium.PdfDocument(TestFiles.text) + page = pdf[0] + textobj = next( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_TEXT]) ) + assert len(page._textpage_wrefs) == 0 + textpage = page.get_textpage() + assert len(page._textpage_wrefs) == 1 + + if explicit_close: + textpage.close() + with caplog.at_level(logging.WARNING): + page.remove_obj(textobj) + + if explicit_close: + assert not caplog.text + else: + assert f"When removing a text pageobject, any textpage handles ought to be closed beforehand - auto-closing {textpage}." 
in caplog.text From e38085f72c4233d8cb25df2a6c5d3b625e5c544e Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 13 May 2024 20:15:52 +0200 Subject: [PATCH 080/140] Add some tasks regarding AutoCloseable.close() --- src/pypdfium2/internal/bases.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py index 51ae38fcb..8351ecc17 100644 --- a/src/pypdfium2/internal/bases.py +++ b/src/pypdfium2/internal/bases.py @@ -92,6 +92,9 @@ def _add_kid(self, k): def close(self, _by_parent=False): + # TODO invalidate self.raw if closing object without finalizer (supposedly, when closing a page, child pageobject handles fall invalid) + # TODO remove object from parent's kids cache on finalization to avoid unnecessary accumulation (also for PdfPage._textpage_wrefs) + if not self.raw or not self._finalizer: return False From d23268948e3e240fdd2c2bb7355a5861f99b9bde Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 14 May 2024 00:06:26 +0200 Subject: [PATCH 081/140] Consistently call `PdfObject` `pageobject` in docs "page object" is slightly unclear - it might be either an object on a page (PdfObject), or an instance of PdfPage. Therefore, call PdfObject "pageobject" (without space) to somewhat outline the difference. --- docs/source/python_api.rst | 4 ++-- docs/source/shell_api.rst | 4 ++-- src/pypdfium2/__main__.py | 2 +- src/pypdfium2/_cli/extract_images.py | 2 +- src/pypdfium2/_helpers/document.py | 4 ++-- src/pypdfium2/_helpers/page.py | 28 +++++++++++++-------------- src/pypdfium2/_helpers/pageobjects.py | 8 ++++---- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/docs/source/python_api.rst b/docs/source/python_api.rst index 9e2c12399..7ca4182b2 100644 --- a/docs/source/python_api.rst +++ b/docs/source/python_api.rst @@ -84,8 +84,8 @@ Page **** .. automodule:: pypdfium2._helpers.page -Page Objects -************ +Pageobjects +*********** .. 
automodule:: pypdfium2._helpers.pageobjects Text Page diff --git a/docs/source/shell_api.rst b/docs/source/shell_api.rst index adaba15a8..d9cc9fc0a 100644 --- a/docs/source/shell_api.rst +++ b/docs/source/shell_api.rst @@ -46,8 +46,8 @@ Image Converter .. command-output:: pypdfium2 imgtopdf --help -Page Objects Info -***************** +Pageobjects Info +**************** .. command-output:: pypdfium2 pageobjects --help diff --git a/src/pypdfium2/__main__.py b/src/pypdfium2/__main__.py index 55e5f1826..0bd7e580b 100644 --- a/src/pypdfium2/__main__.py +++ b/src/pypdfium2/__main__.py @@ -16,7 +16,7 @@ "extract-images": "extract images", "extract-text": "extract text", "imgtopdf": "convert images to PDF", - "pageobjects": "print info on page objects", + "pageobjects": "print info on pageobjects", "pdfinfo": "print info on document and pages", "render": "rasterize pages", "tile": "tile pages (N-up)", diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py index 3a21b6df4..df4e18e65 100644 --- a/src/pypdfium2/_cli/extract_images.py +++ b/src/pypdfium2/_cli/extract_images.py @@ -22,7 +22,7 @@ def attach(parser): "--max-depth", type = int, default = 15, - help = "Maximum recursion depth to consider when looking for page objects.", + help = "Maximum recursion depth to consider when looking for pageobjects.", ) parser.add_argument( "--use-bitmap", diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 0ab49b85c..4a379def6 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -607,8 +607,8 @@ def parent(self): # AutoCloseable hook def as_pageobject(self): """ Returns: - PdfObject: An independent page object representation of the XObject. - If multiple page objects are created from one XObject, they share resources. + PdfObject: An independent pageobject representation of the XObject. + If multiple pageobjects are created from one XObject, they share resources. 
Pageobjects created from an XObject remain valid after the XObject is closed. """ raw_pageobj = pdfium_c.FPDF_NewFormObjectFromXObject(self) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 0e9b24568..518e4e497 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -202,15 +202,15 @@ def get_textpage(self): def insert_obj(self, pageobj): """ - Insert a page object into the page. + Insert a pageobject into the page. - The page object must not belong to a page yet. If it belongs to a PDF, this page must be part of the PDF. + The pageobject must not belong to a page yet. If it belongs to a PDF, this page must be part of the PDF. Position and form are defined by the object's matrix. If it is the identity matrix, the object will appear as-is on the bottom left corner of the page. Parameters: - pageobj (PdfObject): The page object to insert. + pageobj (PdfObject): The pageobject to insert. """ if pageobj.page: @@ -226,19 +226,19 @@ def insert_obj(self, pageobj): def remove_obj(self, pageobj): """ - Remove a page object from the page. - As of PDFium 5692, detached page objects may be only re-inserted into existing pages of the same document. - If the page object is not re-inserted into a page, its ``close()`` method may be called. + Remove a pageobject from the page. + As of PDFium 5692, detached pageobjects may be only re-inserted into existing pages of the same document. + If the pageobject is not re-inserted into a page, its ``close()`` method may be called. Note: If the object's :attr:`~.PdfObject.type` is :data:`FPDF_PAGEOBJ_TEXT`, any :class:`.PdfTextPage` handles to the page should be closed before removing the object. Parameters: - pageobj (PdfObject): The page object to remove. + pageobj (PdfObject): The pageobject to remove. 
""" if pageobj.page is not self: - raise ValueError("The page object you attempted to remove is not part of this page.") + raise ValueError("The pageobject you attempted to remove is not part of this page.") # https://pdfium-review.googlesource.com/c/pdfium/+/118914 if pageobj.type == pdfium_c.FPDF_PAGEOBJ_TEXT: @@ -257,7 +257,7 @@ def remove_obj(self, pageobj): def gen_content(self): """ - Generate page content to apply additions, removals or modifications of page objects. + Generate page content to apply additions, removals or modifications of pageobjects. If page content was changed, this function should be called once before saving the document or re-loading the page. """ @@ -268,18 +268,18 @@ def gen_content(self): def get_objects(self, filter=None, max_depth=15, form=None, level=0): """ - Iterate through the page objects on this page. + Iterate through the pageobjects on this page. Parameters: filter (list[int] | None): - An optional list of page object types to filter (:attr:`FPDF_PAGEOBJ_*`). + An optional list of pageobject types to filter (:attr:`FPDF_PAGEOBJ_*`). Any objects whose type is not contained will be skipped. If None or empty, all objects will be provided, regardless of their type. max_depth (int): Maximum recursion depth to consider when descending into Form XObjects. Yields: - :class:`.PdfObject`: A page object. + :class:`.PdfObject`: A pageobject. """ # TODO close skipped objects explicitly ? 
@@ -295,13 +295,13 @@ def get_objects(self, filter=None, max_depth=15, form=None, level=0): n_objects = count_objects(parent) if n_objects < 0: - raise PdfiumError("Failed to get number of page objects.") + raise PdfiumError("Failed to get number of pageobjects.") for i in range(n_objects): raw_obj = get_object(parent, i) if not raw_obj: - raise PdfiumError("Failed to get page object.") + raise PdfiumError("Failed to get pageobject.") helper_obj = PdfObject(raw_obj, page=self, pdf=self.pdf, level=level) self._add_kid(helper_obj) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 2316ae0e6..6f3f66e1d 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -21,7 +21,7 @@ class PdfObject (pdfium_i.AutoCloseable): """ - Page object helper class. + Pageobject helper class. When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, depending on the object's :attr:`.type` (e. g. :class:`.PdfImage`). @@ -146,14 +146,14 @@ def set_matrix(self, matrix): def transform(self, matrix): """ Parameters: - matrix (PdfMatrix): Multiply the page object's current transform matrix by this matrix. + matrix (PdfMatrix): Multiply the pageobject's current transform matrix by this matrix. """ pdfium_c.FPDFPageObj_Transform(self, *matrix.get()) class PdfImage (PdfObject): """ - Image object helper class (specific kind of page object). + Image object helper class (specific kind of pageobject). """ # cf. 
https://crbug.com/pdfium/1203 @@ -287,7 +287,7 @@ def get_bitmap(self, render=False): if render: if self.pdf is None: - raise RuntimeError("Cannot get rendered bitmap of loose page object.") + raise RuntimeError("Cannot get rendered bitmap of loose pageobject.") raw_bitmap = pdfium_c.FPDFImageObj_GetRenderedBitmap(self.pdf, self.page, self) else: raw_bitmap = pdfium_c.FPDFImageObj_GetBitmap(self) From 428f4c396a1e3335b3c3cc54a02d4b638ccd8fb7 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 14 May 2024 01:37:23 +0200 Subject: [PATCH 082/140] PdfiumError: don't state the obvious --- src/pypdfium2/_helpers/misc.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/pypdfium2/_helpers/misc.py b/src/pypdfium2/_helpers/misc.py index 9b19e0d54..977d1148c 100644 --- a/src/pypdfium2/_helpers/misc.py +++ b/src/pypdfium2/_helpers/misc.py @@ -10,9 +10,6 @@ class PdfiumError (RuntimeError): Attributes: err_code (int | None): PDFium error code, for programmatic handling of error subtypes, if provided by the API in question (e.g. document loading). None otherwise. - - Tip: - Use ``str(exc)`` to get the message of a caught exception. """ def __init__(self, msg, err_code=None): From 7cf09d8493e5e1e11f6c657650aedc97aaf46457 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 14 May 2024 01:58:01 +0200 Subject: [PATCH 083/140] docs/conf.py: comment out namedtuple handler We no longer have any public namedtuples after the removal of PdfOutlineItem and PdfBitmapInfo. 
--- docs/source/conf.py | 14 ++++++-------- src/pypdfium2/_helpers/pageobjects.py | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 1ca0449fa..e82d797a4 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -8,7 +8,7 @@ import os import sys import time -import collections +# import collections from pathlib import Path sys.path.insert(0, str(Path(__file__).parents[2] / "setupsrc")) @@ -59,7 +59,6 @@ "members": True, "undoc-members": True, "show-inheritance": True, - # "inherited-members": True, "member-order": "bysource", } intersphinx_mapping = { @@ -75,12 +74,11 @@ .. |have_changes| replace:: {have_changes} """ -def remove_namedtuple_aliases(app, what, name, obj, skip, options): - if type(obj) is collections._tuplegetter: - return True - return skip - +# def remove_namedtuple_aliases(app, what, name, obj, skip, options): +# if type(obj) is collections._tuplegetter: +# return True +# return skip def setup(app): - app.connect('autodoc-skip-member', remove_namedtuple_aliases) + # app.connect('autodoc-skip-member', remove_namedtuple_aliases) app.add_config_value("have_changes", True, "env") diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 6f3f66e1d..5dddd3664 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -35,7 +35,7 @@ class PdfObject (pdfium_i.AutoCloseable): type (int): The object's type (:data:`FPDF_PAGEOBJ_*`). page (PdfPage): - Reference to the page this pageobject belongs to. May be None if it does not belong to a page yet. + Reference to the page this pageobject belongs to. May be None if not part of a page (e.g. new or detached object). pdf (PdfDocument): Reference to the document this pageobject belongs to. May be None if the object does not belong to a document yet. This attribute is always set if :attr:`.page` is set. 
From 5c66a32085d86f5340f3a0dad8cf8cba76dae85f Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 14 May 2024 02:16:43 +0200 Subject: [PATCH 084/140] PdfBitmap: slightly improve docs for `new_foreign{_simple}()` --- src/pypdfium2/_helpers/bitmap.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index df67c0599..c2f75f519 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -149,8 +149,9 @@ def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, str def new_foreign(cls, width, height, format, rev_byteorder=False, force_packed=False): """ Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by PDFium. + There may be a padding of unused bytes at line end, unless *force_packed=True* is given. - Using this method is discouraged. Prefer :meth:`.new_native` instead. + Note that it is encouraged to prefer :meth:`.new_native`. """ stride = width * pdfium_i.BitmapTypeToNChannels[format] if force_packed else 0 raw = pdfium_c.FPDFBitmap_CreateEx(width, height, format, None, stride) @@ -160,10 +161,9 @@ def new_foreign(cls, width, height, format, rev_byteorder=False, force_packed=Fa @classmethod def new_foreign_simple(cls, width, height, use_alpha, rev_byteorder=False): """ - Create a new bitmap using :func:`FPDFBitmap_Create`. The buffer is allocated by PDFium. - The resulting bitmap is supposed to be packed (i. e. no gap of unused bytes between lines). + Create a new bitmap using :func:`FPDFBitmap_Create`. The buffer is allocated by PDFium, and supposed to be packed (i. e. no gap of unused bytes between lines). - Using this method is discouraged. Prefer :meth:`.new_native` instead. + Note that it is encouraged to prefer :meth:`.new_native`. 
""" raw = pdfium_c.FPDFBitmap_Create(width, height, use_alpha) return cls.from_raw(raw, rev_byteorder) From c8e4a06b5b17f254c635098bc4d1197f51b4efbd Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 16 May 2024 00:44:05 +0200 Subject: [PATCH 085/140] Handle GetCharIndexAtPos() conforming with pdfium docs --- src/pypdfium2/_helpers/textpage.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py index d7eb53779..19cdb4ac6 100644 --- a/src/pypdfium2/_helpers/textpage.py +++ b/src/pypdfium2/_helpers/textpage.py @@ -172,11 +172,14 @@ def get_index(self, x, y, x_tol, y_tol): y_tol (float): Vertical tolerance. Returns: int | None: The index of the character at or nearby the point (x, y). - May be None if there is no character or an error occurred. + May be None if there is no character. If an internal error occurred, an exception will be raised. """ index = pdfium_c.FPDFText_GetCharIndexAtPos(self, x, y, x_tol, y_tol) - if index < 0: + if index == -1: return None + elif index == -3: + raise PdfiumError("An error occurred on attempt to get char index by pos.") + assert index >= 0, "Negative return is not permitted (unhandled error code?)" return index From 14a7fbbf567cf0cc4926591788880a2ffd9a6975 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 18 May 2024 23:32:34 +0200 Subject: [PATCH 086/140] PdfPage.get_objects(): don't register objects as kids This was especially problematic as weakrefs are not cleaned up when the object in question is closed/collected, so we potentially store many dead pointers. Imagine a caller invoking get_objects() repeatedly for iterating and a page handle living for a long time afterwards - that somewhat resembles a memory leak. 
--- src/pypdfium2/_helpers/page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 518e4e497..b539850fd 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -303,8 +303,8 @@ def get_objects(self, filter=None, max_depth=15, form=None, level=0): if not raw_obj: raise PdfiumError("Failed to get pageobject.") + # Not a child object, because the lifetime of pageobjects that are part of a page is managed by pdfium. The .page reference is enough to keep the parent alive, unless the caller explicitly closes it (which may not merit storing countless weakrefs). helper_obj = PdfObject(raw_obj, page=self, pdf=self.pdf, level=level) - self._add_kid(helper_obj) if not filter or helper_obj.type in filter: yield helper_obj From 992e9fe49a913b261393c8dd9460c40aaecf5542 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 18 May 2024 23:35:21 +0200 Subject: [PATCH 087/140] abstractly reformulate bases task It might still be worth doing for the sake of conceptual correctness, even if not currently relevant for practice. 
--- src/pypdfium2/internal/bases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py index 8351ecc17..976763b91 100644 --- a/src/pypdfium2/internal/bases.py +++ b/src/pypdfium2/internal/bases.py @@ -92,7 +92,7 @@ def _add_kid(self, k): def close(self, _by_parent=False): - # TODO invalidate self.raw if closing object without finalizer (supposedly, when closing a page, child pageobject handles fall invalid) + # TODO invalidate self.raw if closing object without finalizer to prevent access after a lifetime-managing parent is closed # TODO remove object from parent's kids cache on finalization to avoid unnecessary accumulation (also for PdfPage._textpage_wrefs) if not self.raw or not self._finalizer: From 59d0e99010d3adf6163debc3cc0bd7eec97f834e Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 28 May 2024 00:32:00 +0200 Subject: [PATCH 088/140] CLI/extract-images: Fix another dotted filepath blunder Same as 7ce4d31a302c2fdc50185e35fc67513d6b3ea373. 
--- src/pypdfium2/_cli/extract_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py index df4e18e65..efa7790cb 100644 --- a/src/pypdfium2/_cli/extract_images.py +++ b/src/pypdfium2/_cli/extract_images.py @@ -68,7 +68,7 @@ def main(args): try: if args.use_bitmap: pil_image = image.get_bitmap(render=args.render).to_pil() - pil_image.save( prefix.with_suffix("."+args.format) ) + pil_image.save(f"{prefix}.{args.format}") else: image.extract(prefix, fb_format=args.format) except pdfium.PdfiumError: From af81b4754ea32292fb2db43a04b41c190d9dcf25 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 31 May 2024 16:30:31 +0200 Subject: [PATCH 089/140] Remove separate `_textpage_wrefs` Given that PdfPage.get_objects() no longer falsely registers pageobjects as kids, textpages are now the only members added to a page's kids cache, nicely simplifying this code passage. However, even more future proof would be to turn kids into a mapping {"type": [*objects], ...}, rather than a shallow list of mixed types, so we could access all kids of a type without overhead. 
--- src/pypdfium2/_helpers/page.py | 13 ++++++------- src/pypdfium2/internal/bases.py | 3 ++- tests/test_textpage.py | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index b539850fd..f992c723c 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -35,7 +35,6 @@ def __init__(self, raw, pdf, formenv): self.raw = raw self.pdf = pdf self.formenv = formenv - self._textpage_wrefs = [] super().__init__(PdfPage._close_impl, self.formenv) @@ -196,7 +195,6 @@ def get_textpage(self): raise PdfiumError("Failed to load text page.") textpage = PdfTextPage(raw_textpage, self) self._add_kid(textpage) - self._textpage_wrefs.append( weakref.ref(textpage) ) return textpage @@ -242,11 +240,12 @@ def remove_obj(self, pageobj): # https://pdfium-review.googlesource.com/c/pdfium/+/118914 if pageobj.type == pdfium_c.FPDF_PAGEOBJ_TEXT: - for wref in self._textpage_wrefs: - textpage = wref() - if textpage and textpage.raw: - logger.warning(f"When removing a text pageobject, any textpage handles ought to be closed beforehand - auto-closing {textpage}.") - textpage.close() + for wref in self._kids: + obj = wref() + if obj and obj.raw: + assert isinstance(obj, PdfTextPage), "This code assumes all kids of a page are textpages." + logger.warning(f"Removing text pageobbject implicitly closes affected textpage {obj}.") + obj.close() ok = pdfium_c.FPDFPage_RemoveObject(self, pageobj) if not ok: diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py index 976763b91..ba347f528 100644 --- a/src/pypdfium2/internal/bases.py +++ b/src/pypdfium2/internal/bases.py @@ -92,8 +92,9 @@ def _add_kid(self, k): def close(self, _by_parent=False): + # TODO remove object from parent's kids cache on finalization to avoid unnecessary accumulation + # -> pre-requisite would be to handle kids inside finalizer, but IIRC there was some weird issue with that? 
# TODO invalidate self.raw if closing object without finalizer to prevent access after a lifetime-managing parent is closed - # TODO remove object from parent's kids cache on finalization to avoid unnecessary accumulation (also for PdfPage._textpage_wrefs) if not self.raw or not self._finalizer: return False diff --git a/tests/test_textpage.py b/tests/test_textpage.py index be21a5386..60789d8a3 100644 --- a/tests/test_textpage.py +++ b/tests/test_textpage.py @@ -162,9 +162,9 @@ def test_autoclose_with_remove_obj(caplog, explicit_close): pdf = pdfium.PdfDocument(TestFiles.text) page = pdf[0] textobj = next( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_TEXT]) ) - assert len(page._textpage_wrefs) == 0 + assert len(page._kids) == 0 textpage = page.get_textpage() - assert len(page._textpage_wrefs) == 1 + assert len(page._kids) == 1 if explicit_close: textpage.close() @@ -174,4 +174,4 @@ def test_autoclose_with_remove_obj(caplog, explicit_close): if explicit_close: assert not caplog.text else: - assert f"When removing a text pageobject, any textpage handles ought to be closed beforehand - auto-closing {textpage}." in caplog.text + assert f"Removing text pageobbject implicitly closes affected textpage {textpage}." 
in caplog.text From 45de679f7005bd7bac08b62ae575598d2deb7c57 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 4 Jun 2024 18:24:50 +0200 Subject: [PATCH 090/140] Clarify `Cannot close object; library is destroyed` condition CC https://github.com/mindee/doctr/pull/1624 --- src/pypdfium2/internal/bases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py index ba347f528..823cfabe3 100644 --- a/src/pypdfium2/internal/bases.py +++ b/src/pypdfium2/internal/bases.py @@ -39,7 +39,7 @@ def _close_template(close_func, raw, obj_repr, state, parent, *args, **kwargs): os.write(sys.stderr.fileno(), f"Close ({desc}) {obj_repr}\n".encode()) if not LIBRARY_AVAILABLE: - os.write(sys.stderr.fileno(), f"-> Cannot close object, library is destroyed. This may cause a memory leak!\n".encode()) + os.write(sys.stderr.fileno(), f"-> Cannot close object; library is destroyed. This may happen on process exit, but should not during runtime.\n".encode()) return assert parent is None or not parent._tree_closed() From 3596eb09b67a609e7889d1729d7503a119ca55a8 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Jul 2024 12:32:56 +0200 Subject: [PATCH 091/140] Correct PdfBookmark.get_count() docstring --- src/pypdfium2/_helpers/document.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 4a379def6..abff23207 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -647,8 +647,9 @@ def get_title(self): def get_count(self): """ Returns: - int: Signed number of direct child bookmarks (i.e. non-recursive). Zero if the bookmark has no descendants. - The initial state shall be closed (collapsed) if negative, open (expanded) if positive. + int: Signed number of child bookmarks that would be visible if the bookmark were open (i.e. recursively counting children of open children). 
+ The bookmark's initial state is open (expanded) if the number is positive, closed (collapsed) if negative. + Zero if the bookmark has no descendants. """ return pdfium_c.FPDFBookmark_GetCount(self) From 85eadfbe6010db46971fe21db5f2a30da534a9b1 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Jul 2024 12:33:05 +0200 Subject: [PATCH 092/140] rendering: lightness inversion for PIL --- src/pypdfium2/_cli/render.py | 88 +++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 12 deletions(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 2236d346a..6b39f7cb9 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -4,11 +4,18 @@ import os import math import logging +import colorsys import functools from pathlib import Path import multiprocessing as mp import concurrent.futures as ft +try: + import PIL.Image + import PIL.ImageFilter + import PIL.ImageDraw +except ImportError: + PIL = None try: import cv2 except ImportError: @@ -188,6 +195,21 @@ def attach(parser): type = str.lower, help = "The map function to use (backend specific, the default is an iterative map)." ) + + postproc = parser.add_argument_group( + title = "Post processing", + description = "Options to post-process rendered images. Note, this may have a strongly negative impact on performance.", + ) + postproc.add_argument( + "--invert-lightness", + action = "store_true", + help = "Invert lightness using the HLS color space (e.g. white<->black, dark_blue<->light_blue). 
The intent is to achieve a dark theme for documents with light background, while providing better visual results than classical color inversion or a flat pdfium color scheme.", + ) + postproc.add_argument( + "--exclude-images", + action = "store_true", + help = "Whether to exclude PDF images from lightness inversion.", + ) class SavingEngine: @@ -199,22 +221,58 @@ def _get_path(self, i): output_dir, prefix, n_digits, format = self._path_parts return output_dir / f"{prefix}{i+1:0{n_digits}d}.{format}" - def __call__(self, bitmap, i): + def __call__(self, i, bitmap, page, postproc_kwargs): out_path = self._get_path(i) - self._saving_hook(out_path, bitmap) + self._saving_hook(out_path, bitmap, page, postproc_kwargs) logger.info(f"Wrote page {i+1} as {out_path.name}") class PILEngine (SavingEngine): - def _saving_hook(self, out_path, bitmap): - bitmap.to_pil().save(out_path) + + def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): + pil_image = bitmap.to_pil() + posconv = bitmap.get_posconv(page) + pil_image = self.postprocess(pil_image, page, posconv, **postproc_kwargs) + pil_image.save(out_path) + + LINV_LUT_SIZE = 17 + + @staticmethod + def _invert_px_lightness(r, g, b): + h, l, s = colorsys.rgb_to_hls(r, g, b) + l = 1 - l + return colorsys.hls_to_rgb(h, l, s) + + @classmethod + @functools.lru_cache(maxsize=1) + def _get_linv_lut(cls): + return PIL.ImageFilter.Color3DLUT.generate(cls.LINV_LUT_SIZE, cls._invert_px_lightness) + + @classmethod + def postprocess(cls, image, page, posconv, invert_lightness, exclude_images): + out_image = image + if invert_lightness: + out_image = image.filter(cls._get_linv_lut()) + if exclude_images: + # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates + images = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) + if len(images) > 0: + mask = PIL.Image.new("1", image.size) + draw = PIL.ImageDraw.Draw(mask) + for obj in images: + qpoints = [posconv.to_bitmap(x, y) for x, 
y in obj.get_quad_points()] + draw.polygon(qpoints, fill=1, outline=1) + out_image.paste(image, mask=mask) + return out_image + class NumpyCV2Engine (SavingEngine): - def _saving_hook(self, out_path, bitmap): + def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): + # TODO post-processing cv2.imwrite(str(out_path), bitmap.to_numpy()) -def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine): +def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine, postproc_kwargs): if extra_init: extra_init() @@ -226,17 +284,18 @@ def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, e pdf.init_forms() global ProcObjs - ProcObjs = (pdf, kwargs, engine) + ProcObjs = (pdf, kwargs, engine, postproc_kwargs) -def _render_job(i, pdf, kwargs, engine): +def _render_job(i, pdf, kwargs, engine, postproc_kwargs): # logger.info(f"Started page {i+1} ...") page = pdf[i] bitmap = page.render(**kwargs) - engine(bitmap, i) + engine(i, bitmap, page, postproc_kwargs) def _render_parallel_job(i): - global ProcObjs; _render_job(i, *ProcObjs) + global ProcObjs + _render_job(i, *ProcObjs) # TODO turn into a python-usable API yielding output paths as they are written @@ -288,6 +347,11 @@ def main(args): for type in args.no_antialias: kwargs[f"no_smooth{type}"] = True + postproc_kwargs = dict( + invert_lightness = args.invert_lightness, + exclude_images = args.exclude_images, + ) + # TODO dump all args except password? 
logger.info(f"{args.engine_cls.__name__}, Format: {args.format}, rev_byteorder: {args.rev_byteorder}, prefer_bgrx {args.prefer_bgrx}") @@ -299,7 +363,7 @@ def main(args): logger.info("Linear rendering ...") for i in args.pages: - _render_job(i, pdf, kwargs, engine) + _render_job(i, pdf, kwargs, engine, postproc_kwargs) else: @@ -317,7 +381,7 @@ def main(args): extra_init = (setup_logging if args.parallel_strategy in ("spawn", "forkserver") else None) pool_kwargs = dict( initializer = _render_parallel_init, - initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine), + initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine, postproc_kwargs), ) n_procs = min(args.processes, len(args.pages)) From c907e1ebb74a2f1a406bcfba91fee663612889b5 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Jul 2024 14:23:32 +0200 Subject: [PATCH 093/140] Add OpenCV lightness inversion TODO: image exclusion --- src/pypdfium2/_cli/render.py | 46 ++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 6b39f7cb9..47b5d5643 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -230,46 +230,66 @@ def __call__(self, i, bitmap, page, postproc_kwargs): class PILEngine (SavingEngine): def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): - pil_image = bitmap.to_pil() posconv = bitmap.get_posconv(page) + pil_image = bitmap.to_pil() pil_image = self.postprocess(pil_image, page, posconv, **postproc_kwargs) pil_image.save(out_path) - LINV_LUT_SIZE = 17 - @staticmethod def _invert_px_lightness(r, g, b): h, l, s = colorsys.rgb_to_hls(r, g, b) l = 1 - l return colorsys.hls_to_rgb(h, l, s) + LINV_LUT_SIZE = 17 + @classmethod @functools.lru_cache(maxsize=1) def _get_linv_lut(cls): return PIL.ImageFilter.Color3DLUT.generate(cls.LINV_LUT_SIZE, cls._invert_px_lightness) @classmethod - def postprocess(cls, 
image, page, posconv, invert_lightness, exclude_images): - out_image = image + def postprocess(cls, orig_image, page, posconv, invert_lightness, exclude_images): + out_image = orig_image if invert_lightness: - out_image = image.filter(cls._get_linv_lut()) + out_image = out_image.filter(cls._get_linv_lut()) if exclude_images: # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates - images = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) - if len(images) > 0: - mask = PIL.Image.new("1", image.size) + image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) + if len(image_objs) > 0: + mask = PIL.Image.new("1", orig_image.size) draw = PIL.ImageDraw.Draw(mask) - for obj in images: + for obj in image_objs: qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()] draw.polygon(qpoints, fill=1, outline=1) - out_image.paste(image, mask=mask) + out_image.paste(orig_image, mask=mask) return out_image class NumpyCV2Engine (SavingEngine): + def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): - # TODO post-processing - cv2.imwrite(str(out_path), bitmap.to_numpy()) + np_array = bitmap.to_numpy() + np_array = self.postprocess(np_array, bitmap, page, **postproc_kwargs) + cv2.imwrite(str(out_path), np_array) + + @classmethod + def postprocess(cls, image, bitmap, page, invert_lightness, exclude_images): + if invert_lightness: + # posconv = bitmap.get_posconv(page) + assert bitmap.format == pdfium_r.FPDFBitmap_BGR, "Lightness inversion is only implemented for RGB/BGR" + if bitmap.rev_byteorder: + convert_to = cv2.COLOR_RGB2HLS + convert_from = cv2.COLOR_HLS2RGB + else: + convert_to = cv2.COLOR_BGR2HLS + convert_from = cv2.COLOR_HLS2BGR + image = cv2.cvtColor(image, convert_to) + h, l, s = cv2.split(image) + l = ~l + image = cv2.merge([h, l, s]) + image = cv2.cvtColor(image, convert_from) + return image def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, 
engine, postproc_kwargs): From 736101d36159833b334790cdc32a662b3b874539 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Jul 2024 15:35:43 +0200 Subject: [PATCH 094/140] Implement opencv image exclusion Thew, that was tough. In particular, the argument order for copyTo() was really confusing, because the C signature is (src, dst, mask), whereas the python signature is (src, mask, dst). --- src/pypdfium2/_cli/render.py | 38 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 47b5d5643..fbd1c7395 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -18,8 +18,10 @@ PIL = None try: import cv2 + import numpy as np except ImportError: cv2 = None + np = None import pypdfium2._helpers as pdfium import pypdfium2.internal as pdfium_i @@ -249,21 +251,21 @@ def _get_linv_lut(cls): return PIL.ImageFilter.Color3DLUT.generate(cls.LINV_LUT_SIZE, cls._invert_px_lightness) @classmethod - def postprocess(cls, orig_image, page, posconv, invert_lightness, exclude_images): - out_image = orig_image + def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images): + dst_image = src_image if invert_lightness: - out_image = out_image.filter(cls._get_linv_lut()) + dst_image = dst_image.filter(cls._get_linv_lut()) if exclude_images: # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) if len(image_objs) > 0: - mask = PIL.Image.new("1", orig_image.size) + mask = PIL.Image.new("1", src_image.size) draw = PIL.ImageDraw.Draw(mask) for obj in image_objs: qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()] draw.polygon(qpoints, fill=1, outline=1) - out_image.paste(orig_image, mask=mask) - return out_image + dst_image.paste(src_image, mask=mask) + return dst_image class NumpyCV2Engine (SavingEngine): @@ 
-274,9 +276,9 @@ def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): cv2.imwrite(str(out_path), np_array) @classmethod - def postprocess(cls, image, bitmap, page, invert_lightness, exclude_images): + def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): + dst_image = src_image if invert_lightness: - # posconv = bitmap.get_posconv(page) assert bitmap.format == pdfium_r.FPDFBitmap_BGR, "Lightness inversion is only implemented for RGB/BGR" if bitmap.rev_byteorder: convert_to = cv2.COLOR_RGB2HLS @@ -284,12 +286,22 @@ def postprocess(cls, image, bitmap, page, invert_lightness, exclude_images): else: convert_to = cv2.COLOR_BGR2HLS convert_from = cv2.COLOR_HLS2BGR - image = cv2.cvtColor(image, convert_to) - h, l, s = cv2.split(image) + dst_image = cv2.cvtColor(dst_image, convert_to) + h, l, s = cv2.split(dst_image) l = ~l - image = cv2.merge([h, l, s]) - image = cv2.cvtColor(image, convert_from) - return image + dst_image = cv2.merge([h, l, s]) + dst_image = cv2.cvtColor(dst_image, convert_from) + if exclude_images: + posconv = bitmap.get_posconv(page) + image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) + if len(image_objs) > 0: + mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8) + for obj in image_objs: + qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()] + qpoints = np.array(qpoints, np.int32) + cv2.fillPoly(mask, [qpoints], 1) + cv2.copyTo(src_image, mask=mask, dst=dst_image) + return dst_image def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine, postproc_kwargs): From 822c1b763494e333cf29e8b06cc2048ee53521ce Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Jul 2024 15:42:09 +0200 Subject: [PATCH 095/140] opencv: fill all polygons in one go --- src/pypdfium2/_cli/render.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index fbd1c7395..7e12d3b3f 
100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -296,10 +296,8 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) if len(image_objs) > 0: mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8) - for obj in image_objs: - qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()] - qpoints = np.array(qpoints, np.int32) - cv2.fillPoly(mask, [qpoints], 1) + polygons = [np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32) for obj in image_objs] + cv2.fillPoly(mask, polygons, 1) cv2.copyTo(src_image, mask=mask, dst=dst_image) return dst_image From 2746244996df924b42cf28b16081e3f481f195ed Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Jul 2024 15:43:15 +0200 Subject: [PATCH 096/140] Revert "opencv: fill all polygons in one go" This did the wrong thing when polygons intersect. --- src/pypdfium2/_cli/render.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 7e12d3b3f..a2f954357 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -296,8 +296,9 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) if len(image_objs) > 0: mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8) - polygons = [np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32) for obj in image_objs] - cv2.fillPoly(mask, polygons, 1) + for obj in image_objs: + qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32) + cv2.fillPoly(mask, [qpoints], 1) cv2.copyTo(src_image, mask=mask, dst=dst_image) return dst_image From e68d3da5483af1cada880324123e90f8074b4641 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 11 Jul 2024 15:46:44 
+0200 Subject: [PATCH 097/140] Add some line breaks --- src/pypdfium2/_cli/render.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index a2f954357..bdffbc5fa 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -278,19 +278,23 @@ def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): @classmethod def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): dst_image = src_image + if invert_lightness: assert bitmap.format == pdfium_r.FPDFBitmap_BGR, "Lightness inversion is only implemented for RGB/BGR" + if bitmap.rev_byteorder: convert_to = cv2.COLOR_RGB2HLS convert_from = cv2.COLOR_HLS2RGB else: convert_to = cv2.COLOR_BGR2HLS convert_from = cv2.COLOR_HLS2BGR + dst_image = cv2.cvtColor(dst_image, convert_to) h, l, s = cv2.split(dst_image) l = ~l dst_image = cv2.merge([h, l, s]) dst_image = cv2.cvtColor(dst_image, convert_from) + if exclude_images: posconv = bitmap.get_posconv(page) image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) @@ -300,6 +304,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32) cv2.fillPoly(mask, [qpoints], 1) cv2.copyTo(src_image, mask=mask, dst=dst_image) + return dst_image From 428e970375b711aa769bd742661a531a55ae0987 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 12 Jul 2024 14:01:00 +0200 Subject: [PATCH 098/140] pil/polygon: don't draw an outline --- src/pypdfium2/_cli/render.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index bdffbc5fa..d330829d0 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -263,7 +263,7 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images) draw = PIL.ImageDraw.Draw(mask) for obj in image_objs: 
qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()] - draw.polygon(qpoints, fill=1, outline=1) + draw.polygon(qpoints, fill=1) dst_image.paste(src_image, mask=mask) return dst_image From 2bb67665d8fcd02f5497525fb67adc6799b0c277 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 12 Jul 2024 14:07:47 +0200 Subject: [PATCH 099/140] Add missing mkdir with refbindings (fixes #320) --- setupsrc/pypdfium2_setup/packaging_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py index 099a9ac59..887a6c6b1 100644 --- a/setupsrc/pypdfium2_setup/packaging_base.py +++ b/setupsrc/pypdfium2_setup/packaging_base.py @@ -494,6 +494,7 @@ def build_pdfium_bindings(version, headers_dir=None, **kwargs): flags_diff = set(kwargs["flags"]).difference(REFBINDINGS_FLAGS) if flags_diff: # == not set(...).issubset(...) print(f"Warning: The following requested flags are not available in the reference bindings and will be discarded: {flags_diff}") + DataDir_Bindings.mkdir(parents=True, exist_ok=True) shutil.copyfile(RefBindingsFile, DataDir_Bindings/BindingsFN) write_json(ver_path, dict(version=bindings_ver, flags=REFBINDINGS_FLAGS, run_lds=["."], source="reference")) return From 775fb491e3283ebfdbb4a572285e65ed51f6ac83 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 12 Jul 2024 14:37:12 +0200 Subject: [PATCH 100/140] lightness inversion: expand pixel formats compat --- src/pypdfium2/_cli/render.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index d330829d0..5151a7d4a 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -12,6 +12,7 @@ try: import PIL.Image + import PIL.ImageOps import PIL.ImageFilter import PIL.ImageDraw except ImportError: @@ -254,7 +255,10 @@ def _get_linv_lut(cls): def postprocess(cls, src_image, page, posconv, 
invert_lightness, exclude_images): dst_image = src_image if invert_lightness: - dst_image = dst_image.filter(cls._get_linv_lut()) + if src_image.mode == "L": + dst_image = PIL.ImageOps.invert(src_image) + else: + dst_image = dst_image.filter(cls._get_linv_lut()) if exclude_images: # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) @@ -280,22 +284,26 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): dst_image = src_image if invert_lightness: - assert bitmap.format == pdfium_r.FPDFBitmap_BGR, "Lightness inversion is only implemented for RGB/BGR" - if bitmap.rev_byteorder: - convert_to = cv2.COLOR_RGB2HLS - convert_from = cv2.COLOR_HLS2RGB + if bitmap.format == pdfium_r.FPDFBitmap_Gray: + dst_image = 255 - src_image else: - convert_to = cv2.COLOR_BGR2HLS - convert_from = cv2.COLOR_HLS2BGR - - dst_image = cv2.cvtColor(dst_image, convert_to) - h, l, s = cv2.split(dst_image) - l = ~l - dst_image = cv2.merge([h, l, s]) - dst_image = cv2.cvtColor(dst_image, convert_from) + + if bitmap.rev_byteorder: + convert_to = cv2.COLOR_RGB2HLS + convert_from = cv2.COLOR_HLS2RGB + else: + convert_to = cv2.COLOR_BGR2HLS + convert_from = cv2.COLOR_HLS2BGR + + dst_image = cv2.cvtColor(dst_image, convert_to) + h, l, s = cv2.split(dst_image) + l = ~l + dst_image = cv2.merge([h, l, s]) + dst_image = cv2.cvtColor(dst_image, convert_from) if exclude_images: + assert bitmap.format != pdfium_r.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2" posconv = bitmap.get_posconv(page) image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) if len(image_objs) > 0: @@ -303,7 +311,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): for obj in image_objs: qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32) cv2.fillPoly(mask, 
[qpoints], 1) - cv2.copyTo(src_image, mask=mask, dst=dst_image) + dst_image = cv2.copyTo(src_image, mask=mask, dst=dst_image) return dst_image From bc42d19161d0e6a7d18a11c39a92d0d18a5f47b5 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 12 Jul 2024 20:58:39 +0200 Subject: [PATCH 101/140] Remove wrong comments --- src/pypdfium2/_cli/imgtopdf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/pypdfium2/_cli/imgtopdf.py b/src/pypdfium2/_cli/imgtopdf.py index 542d637e2..6c3ec82a6 100644 --- a/src/pypdfium2/_cli/imgtopdf.py +++ b/src/pypdfium2/_cli/imgtopdf.py @@ -38,8 +38,6 @@ def main(args): # Due to limitations in PDFium's public API, this function may be inefficient/lossy for non-JPEG input. # The technically best available open-source tool for image to PDF conversion is probably img2pdf (although its code style can be regarded as displeasing). - # Development note: We are closing objects explicitly because loading JPEGs non-inline binds file handles to the PDF, which need to be released as soon as possible. Without this, we have already run into "OSError: Too many open files" while testing. - pdf = pdfium.PdfDocument.new() for fp in args.images: From 7694cea59c85f219d6bc947f35c55be304909352 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 13 Jul 2024 13:54:22 +0200 Subject: [PATCH 102/140] [Experimental] Defer imports of optional dependencies --- README.md | 8 ++-- src/pypdfium2/_cli/imgtopdf.py | 10 ++--- src/pypdfium2/_cli/render.py | 57 +++++++++++++++------------ src/pypdfium2/_helpers/bitmap.py | 16 +++----- src/pypdfium2/_helpers/pageobjects.py | 15 +++---- 5 files changed, 54 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 1c045589f..7933c090b 100644 --- a/README.md +++ b/README.md @@ -176,12 +176,14 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct ### Runtime Dependencies -As of this writing, pypdfium2 does not need any mandatory runtime dependencies apart from Python itself. 
+As of this writing, pypdfium2 does not require any mandatory runtime dependencies apart from Python itself. -However, some optional support model features require additional packages: -* [`Pillow`](https://pillow.readthedocs.io/en/stable/) (module name `PIL`) is a pouplar imaging library for Python. pypdfium2 provides convenience methods to translate between raw bitmap buffers and PIL images. +However, some optional support model features need additional packages: +* [`Pillow`](https://pillow.readthedocs.io/en/stable/) (module `PIL`) is a pouplar imaging library for Python. pypdfium2 provides convenience adapters to translate between raw bitmap buffers and PIL images. It also uses PIL for some command-line functionality (e.g. image saving). * [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. Similar to `Pillow`, pypdfium2 provides helpers to get a numpy array view of a raw bitmap. +* [`opencv-python`](https://github.com/opencv/opencv-python) (module `cv2`) is an imaging library built around numpy arrays. It can be used in the rendering CLI to save with pypdfium2's numpy adapter. +pypdfium2 tries to defer imports of optional dependencies to the scopes where they are actually accessed, so there should be no startup overhead if you don't use them. 
### Setup Magic diff --git a/src/pypdfium2/_cli/imgtopdf.py b/src/pypdfium2/_cli/imgtopdf.py index 6c3ec82a6..2238267ac 100644 --- a/src/pypdfium2/_cli/imgtopdf.py +++ b/src/pypdfium2/_cli/imgtopdf.py @@ -6,11 +6,6 @@ from pathlib import Path import pypdfium2._helpers as pdfium -try: - import PIL.Image -except ImportError: - PIL = None - def attach(parser): parser.add_argument( @@ -34,6 +29,11 @@ def attach(parser): def main(args): + try: + import PIL.Image + except ImportError: + PIL = None # JPEG can be convered without PIL + # Rudimentary image to PDF conversion (testing / proof of concept) # Due to limitations in PDFium's public API, this function may be inefficient/lossy for non-JPEG input. # The technically best available open-source tool for image to PDF conversion is probably img2pdf (although its code style can be regarded as displeasing). diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 5151a7d4a..3cf6ecf20 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -9,20 +9,7 @@ from pathlib import Path import multiprocessing as mp import concurrent.futures as ft - -try: - import PIL.Image - import PIL.ImageOps - import PIL.ImageFilter - import PIL.ImageDraw -except ImportError: - PIL = None -try: - import cv2 - import numpy as np -except ImportError: - cv2 = None - np = None +from importlib.util import find_spec import pypdfium2._helpers as pdfium import pypdfium2.internal as pdfium_i @@ -33,6 +20,7 @@ BooleanOptionalAction, ) +have_cv2 = find_spec("cv2") is not None logger = logging.getLogger(__name__) @@ -217,21 +205,31 @@ def attach(parser): class SavingEngine: - def __init__(self, path_parts): + def __init__(self, path_parts, postproc_kwargs): self._path_parts = path_parts + self.postproc_kwargs = postproc_kwargs def _get_path(self, i): output_dir, prefix, n_digits, format = self._path_parts return output_dir / f"{prefix}{i+1:0{n_digits}d}.{format}" - def __call__(self, i, bitmap, page, 
postproc_kwargs): + def __call__(self, i, bitmap, page): out_path = self._get_path(i) - self._saving_hook(out_path, bitmap, page, postproc_kwargs) + self._saving_hook(out_path, bitmap, page, self.postproc_kwargs) logger.info(f"Wrote page {i+1} as {out_path.name}") class PILEngine (SavingEngine): + def do_imports(self): + if not self.postproc_kwargs["invert_lightness"]: + return + global PIL + import PIL.Image + import PIL.ImageOps + import PIL.ImageFilter + import PIL.ImageDraw + def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): posconv = bitmap.get_posconv(page) pil_image = bitmap.to_pil() @@ -274,6 +272,12 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images) class NumpyCV2Engine (SavingEngine): + @staticmethod + def do_imports(): + global cv2, np + import cv2 + import numpy as np + def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): np_array = bitmap.to_numpy() np_array = self.postprocess(np_array, bitmap, page, **postproc_kwargs) @@ -316,7 +320,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): return dst_image -def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine, postproc_kwargs): +def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine): if extra_init: extra_init() @@ -327,15 +331,17 @@ def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, e if may_init_forms: pdf.init_forms() + engine.do_imports() + global ProcObjs - ProcObjs = (pdf, kwargs, engine, postproc_kwargs) + ProcObjs = (pdf, kwargs, engine) -def _render_job(i, pdf, kwargs, engine, postproc_kwargs): +def _render_job(i, pdf, kwargs, engine): # logger.info(f"Started page {i+1} ...") page = pdf[i] bitmap = page.render(**kwargs) - engine(i, bitmap, page, postproc_kwargs) + engine(i, bitmap, page) def _render_parallel_job(i): global ProcObjs @@ -362,7 +368,7 @@ def main(args): # numpy+cv2 is much faster for PNG, and PIL 
faster for JPG, but this might simply be due to different encoding defaults if args.engine_cls is None: - if cv2 != None and args.format == "png": + if have_cv2 != None and args.format == "png": args.engine_cls = NumpyCV2Engine else: args.engine_cls = PILEngine @@ -401,13 +407,14 @@ def main(args): n_digits = len(str(pdf_len)) path_parts = (args.output, args.prefix, n_digits, args.format) - engine = args.engine_cls(path_parts) + engine = args.engine_cls(path_parts, postproc_kwargs) if len(args.pages) <= args.linear: logger.info("Linear rendering ...") + engine.do_imports() for i in args.pages: - _render_job(i, pdf, kwargs, engine, postproc_kwargs) + _render_job(i, pdf, kwargs, engine) else: @@ -425,7 +432,7 @@ def main(args): extra_init = (setup_logging if args.parallel_strategy in ("spawn", "forkserver") else None) pool_kwargs = dict( initializer = _render_parallel_init, - initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine, postproc_kwargs), + initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine), ) n_procs = min(args.processes, len(args.pages)) diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index c2f75f519..1d17d1e62 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -11,16 +11,6 @@ logger = logging.getLogger(__name__) -try: - import PIL.Image -except ImportError: - PIL = None - -try: - import numpy -except ImportError: - numpy = None - class PdfBitmap (pdfium_i.AutoCloseable): """ @@ -215,6 +205,8 @@ def to_numpy(self): # https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray + import numpy + array = numpy.ndarray( # layout: row major shape = (self.height, self.width, self.n_channels), @@ -242,6 +234,8 @@ def to_pil(self): # https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.frombuffer # https://pillow.readthedocs.io/en/stable/handbook/writing-your-own-image-plugin.html#the-raw-decoder 
+ import PIL.Image + dest_mode = pdfium_i.BitmapTypeToStrReverse[self.format] image = PIL.Image.frombuffer( dest_mode, # target color format @@ -300,6 +294,8 @@ def get_posconv(self, page): def _pil_convert_for_pdfium(pil_image): + import PIL.Image + if pil_image.mode == "1": pil_image = pil_image.convert("L") elif pil_image.mode.startswith("RGB"): diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 5dddd3664..409bbc1b5 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -13,11 +13,6 @@ from pypdfium2._helpers.matrix import PdfMatrix from pypdfium2._helpers.bitmap import PdfBitmap -try: - import PIL.Image -except ImportError: - PIL = None - class PdfObject (pdfium_i.AutoCloseable): """ @@ -384,13 +379,13 @@ class ImageNotExtractableError (Exception): pass -def _get_pil_mode(colorspace, bpp): +def _get_pil_mode(cs, bpp): # In theory, indexed (palettized) and ICC-based color spaces could be handled as well, but PDFium currently does not provide access to the palette or the ICC profile - if colorspace == pdfium_c.FPDF_COLORSPACE_DEVICEGRAY: + if cs == pdfium_c.FPDF_COLORSPACE_DEVICEGRAY: return "1" if bpp == 1 else "L" - elif colorspace == pdfium_c.FPDF_COLORSPACE_DEVICERGB: + elif cs == pdfium_c.FPDF_COLORSPACE_DEVICERGB: return "RGB" - elif colorspace == pdfium_c.FPDF_COLORSPACE_DEVICECMYK: + elif cs == pdfium_c.FPDF_COLORSPACE_DEVICECMYK: return "CMYK" else: return None @@ -398,6 +393,8 @@ def _get_pil_mode(colorspace, bpp): def _extract_smart(image_obj, fb_format=None): + import PIL.Image + try: # TODO can we change PdfImage.get_data() to take an mmap, so the data could be written directly into a file rather than an in-memory array? 
data, info = _extract_direct(image_obj) From d7fc983301ca4d507e6b774a372c8289d152b4ff Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 13 Jul 2024 14:11:51 +0200 Subject: [PATCH 103/140] changelog: add ref to selective lightness inversion --- docs/devel/changelog_staging.md | 2 +- req/converters.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 090507e1f..b38be9734 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -10,7 +10,7 @@ * Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). Instead, use `PdfPage.render()` with a loop or process pool. * Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`. * `PdfBitmap.from_pil()`: Removed `recopy` param. - * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark them" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion. + * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark them" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion, as is now implemented in pypdfium2's rendering CLI. - Pageobjects * Renamed `PdfObject.get_pos()` to `.get_bounds()`. * Renamed `PdfImage.get_size()` to `.get_px_size()`. 
diff --git a/req/converters.txt b/req/converters.txt index f1c7e2688..551d15c55 100644 --- a/req/converters.txt +++ b/req/converters.txt @@ -1,3 +1,3 @@ -# NOTE In order to use numpy, the rendering CLI further needs `opencv-python`, but we don't currently cover that internally. As the import is guarded, we don't have to require it here. +# NOTE In order to use numpy, the rendering CLI further needs `opencv-python[-headless]`, but we don't currently cover that internally. As the import is guarded, we don't have to require it here. pillow numpy From 78997587ea9df59fa6b1293923e552e1446bdf57 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 13 Jul 2024 17:18:12 +0200 Subject: [PATCH 104/140] Do engine imports in parent process with fork context --- src/pypdfium2/_cli/render.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 3cf6ecf20..ef43d66a7 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -320,19 +320,16 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): return dst_image -def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine): - - if extra_init: - extra_init() +def _render_parallel_init(logging_init, engine_init, input, password, may_init_forms, kwargs, engine): + logging_init() logger.info(f"Initializing data for process {os.getpid()}") + engine_init() pdf = pdfium.PdfDocument(input, password=password, autoclose=True) if may_init_forms: pdf.init_forms() - engine.do_imports() - global ProcObjs ProcObjs = (pdf, kwargs, engine) @@ -347,6 +344,8 @@ def _render_parallel_job(i): global ProcObjs _render_job(i, *ProcObjs) +def _do_nothing(): pass + # TODO turn into a python-usable API yielding output paths as they are written def main(args): @@ -429,10 +428,15 @@ def main(args): if args.parallel_map: map_attr = args.parallel_map - extra_init = (setup_logging if 
args.parallel_strategy in ("spawn", "forkserver") else None) + if args.parallel_strategy == "fork": + logging_init, engine_init = _do_nothing, _do_nothing + engine.do_imports() + else: + logging_init, engine_init = setup_logging, engine.do_imports + pool_kwargs = dict( initializer = _render_parallel_init, - initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine), + initargs = (logging_init, engine_init, pdf._input, args.password, args.draw_forms, kwargs, engine), ) n_procs = min(args.processes, len(args.pages)) From 9d715cf74d1f5e4184c35cfea256861af1ddda88 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 13 Jul 2024 19:19:15 +0200 Subject: [PATCH 105/140] Use LazyLoader for deferred top-level imports This allows us to avoid imports in functions, which potentially means in loops. Not changing the renderer CLI as the engine.do_imports() strategy seems nice. See also https://gist.github.com/mara004/6915e904797916b961e9c53b4fc874ec for prior research on the subject of deferred imports. --- README.md | 2 +- docs/devel/changelog_staging.md | 1 + src/pypdfium2/_cli/imgtopdf.py | 10 +++------ src/pypdfium2/_cli/render.py | 10 +++++---- src/pypdfium2/_helpers/bitmap.py | 17 ++++++-------- src/pypdfium2/_helpers/pageobjects.py | 6 ++--- src/pypdfium2/_utils.py | 32 +++++++++++++++++++++++++++ 7 files changed, 53 insertions(+), 25 deletions(-) create mode 100644 src/pypdfium2/_utils.py diff --git a/README.md b/README.md index 7933c090b..0b6916c37 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ However, some optional support model features need additional packages: * [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. Similar to `Pillow`, pypdfium2 provides helpers to get a numpy array view of a raw bitmap. * [`opencv-python`](https://github.com/opencv/opencv-python) (module `cv2`) is an imaging library built around numpy arrays. 
It can be used in the rendering CLI to save with pypdfium2's numpy adapter. -pypdfium2 tries to defer imports of optional dependencies to the scopes where they are actually accessed, so there should be no startup overhead if you don't use them. +pypdfium2 tries to defer imports of optional dependencies until they are actually needed, so there should be no startup overhead if you don't use them. ### Setup Magic diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index b38be9734..7cd089d6c 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -26,6 +26,7 @@ - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype. - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`. +- Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them. - Simplified version impl (no API change expected). 
*Project* diff --git a/src/pypdfium2/_cli/imgtopdf.py b/src/pypdfium2/_cli/imgtopdf.py index 2238267ac..b43ec09c8 100644 --- a/src/pypdfium2/_cli/imgtopdf.py +++ b/src/pypdfium2/_cli/imgtopdf.py @@ -5,7 +5,8 @@ from pathlib import Path import pypdfium2._helpers as pdfium - +from pypdfium2._utils import deferred_import +PIL_Image = deferred_import("PIL.Image") def attach(parser): parser.add_argument( @@ -29,11 +30,6 @@ def attach(parser): def main(args): - try: - import PIL.Image - except ImportError: - PIL = None # JPEG can be convered without PIL - # Rudimentary image to PDF conversion (testing / proof of concept) # Due to limitations in PDFium's public API, this function may be inefficient/lossy for non-JPEG input. # The technically best available open-source tool for image to PDF conversion is probably img2pdf (although its code style can be regarded as displeasing). @@ -48,7 +44,7 @@ def main(args): if fp.suffix.lower() in (".jpg", ".jpeg"): image_obj.load_jpeg(fp, inline=args.inline) else: - pil_image = PIL.Image.open(fp) + pil_image = PIL_Image.open(fp) bitmap = pdfium.PdfBitmap.from_pil(pil_image) pil_image.close() image_obj.set_bitmap(bitmap) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index ef43d66a7..f2778a57a 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -20,6 +20,7 @@ BooleanOptionalAction, ) +have_pil = find_spec("PIL") is not None have_cv2 = find_spec("cv2") is not None logger = logging.getLogger(__name__) @@ -272,11 +273,11 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images) class NumpyCV2Engine (SavingEngine): - @staticmethod - def do_imports(): + def do_imports(self): global cv2, np import cv2 - import numpy as np + if self.postproc_kwargs["exclude_images"]: + import numpy as np def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): np_array = bitmap.to_numpy() @@ -367,7 +368,8 @@ def main(args): # numpy+cv2 is much faster for PNG, and PIL 
faster for JPG, but this might simply be due to different encoding defaults if args.engine_cls is None: - if have_cv2 != None and args.format == "png": + assert have_pil or have_cv2, "Either pillow or numpy+cv2 must be installed for rendering CLI." + if (not have_pil) or (have_cv2 and args.format == "png"): args.engine_cls = NumpyCV2Engine else: args.engine_cls = PILEngine diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index 1d17d1e62..c6c75fa0c 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -8,6 +8,9 @@ import pypdfium2.raw as pdfium_c import pypdfium2.internal as pdfium_i from pypdfium2._helpers.misc import PdfiumError +from pypdfium2._utils import deferred_import +numpy = deferred_import("numpy") +PIL_Image = deferred_import("PIL.Image") logger = logging.getLogger(__name__) @@ -205,8 +208,6 @@ def to_numpy(self): # https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray - import numpy - array = numpy.ndarray( # layout: row major shape = (self.height, self.width, self.n_channels), @@ -234,10 +235,8 @@ def to_pil(self): # https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.frombuffer # https://pillow.readthedocs.io/en/stable/handbook/writing-your-own-image-plugin.html#the-raw-decoder - import PIL.Image - dest_mode = pdfium_i.BitmapTypeToStrReverse[self.format] - image = PIL.Image.frombuffer( + image = PIL_Image.frombuffer( dest_mode, # target color format (self.width, self.height), # size self.buffer, # buffer @@ -294,8 +293,6 @@ def get_posconv(self, page): def _pil_convert_for_pdfium(pil_image): - import PIL.Image - if pil_image.mode == "1": pil_image = pil_image.convert("L") elif pil_image.mode.startswith("RGB"): @@ -308,14 +305,14 @@ def _pil_convert_for_pdfium(pil_image): # convert RGB(A/X) to BGR(A) for PDFium if pil_image.mode == "RGB": r, g, b = pil_image.split() - pil_image = PIL.Image.merge("RGB", (b, g, r)) + pil_image = 
PIL_Image.merge("RGB", (b, g, r)) elif pil_image.mode == "RGBA": r, g, b, a = pil_image.split() - pil_image = PIL.Image.merge("RGBA", (b, g, r, a)) + pil_image = PIL_Image.merge("RGBA", (b, g, r, a)) elif pil_image.mode == "RGBX": # technically the x channel may be unnecessary, but preserve what the caller passes in r, g, b, x = pil_image.split() - pil_image = PIL.Image.merge("RGBX", (b, g, r, x)) + pil_image = PIL_Image.merge("RGBX", (b, g, r, x)) return pil_image diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 409bbc1b5..084babb60 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -12,6 +12,8 @@ from pypdfium2._helpers.misc import PdfiumError from pypdfium2._helpers.matrix import PdfMatrix from pypdfium2._helpers.bitmap import PdfBitmap +from pypdfium2._utils import deferred_import +PIL_Image = deferred_import("PIL.Image") class PdfObject (pdfium_i.AutoCloseable): @@ -393,8 +395,6 @@ def _get_pil_mode(cs, bpp): def _extract_smart(image_obj, fb_format=None): - import PIL.Image - try: # TODO can we change PdfImage.get_data() to take an mmap, so the data could be written directly into a file rather than an in-memory array? 
data, info = _extract_direct(image_obj) @@ -406,7 +406,7 @@ def _extract_smart(image_obj, fb_format=None): format = info.format if format == "raw": metadata = info.metadata - pil_image = PIL.Image.frombuffer( + pil_image = PIL_Image.frombuffer( info.mode, (metadata.width, metadata.height), image_obj.get_data(decode_simple=True), diff --git a/src/pypdfium2/_utils.py b/src/pypdfium2/_utils.py new file mode 100644 index 000000000..a1c459281 --- /dev/null +++ b/src/pypdfium2/_utils.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2024 geisserml +# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +import sys +import importlib.util + + +def deferred_import(modpath): + + # FIXME If modpath points to a submodule, the parent module will be loaded immediately when this function is called. This is a limitation of the find_spec() importlib API used here. However, this may still be useful if the parent is a mere namespace package that does not contain anything expensive, as in the case of PIL. + + module = sys.modules.get(modpath, None) + if module is not None: + return module # shortcut + + # assuming an optional dependency + # returning None will simply let it fail with an AttributeError when attempting to access the module + try: + spec = importlib.util.find_spec(modpath) + except ModuleNotFoundError: + return None + if spec is None: + return None + + # see https://docs.python.org/3/library/importlib.html#implementing-lazy-imports + loader = importlib.util.LazyLoader(spec.loader) + spec.loader = loader + module = importlib.util.module_from_spec(spec) + sys.modules[modpath] = module + loader.exec_module(module) + + return module From db65e002e183c2d9eb2fe4ea5209f2b5b9906e2d Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 13 Jul 2024 21:44:36 +0200 Subject: [PATCH 106/140] Consistently use unary operator for inversion --- src/pypdfium2/_cli/render.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_cli/render.py 
b/src/pypdfium2/_cli/render.py index f2778a57a..e1b278717 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -291,7 +291,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): if invert_lightness: if bitmap.format == pdfium_r.FPDFBitmap_Gray: - dst_image = 255 - src_image + dst_image = ~src_image else: if bitmap.rev_byteorder: From 7803b273c62d7d134a170d27091b186b53ad33f7 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 13 Jul 2024 22:02:35 +0200 Subject: [PATCH 107/140] style --- src/pypdfium2/_cli/render.py | 14 +++++++------- src/pypdfium2/_helpers/bitmap.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index e1b278717..9df642ae2 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -13,7 +13,7 @@ import pypdfium2._helpers as pdfium import pypdfium2.internal as pdfium_i -import pypdfium2.raw as pdfium_r +import pypdfium2.raw as pdfium_c from pypdfium2._cli._parsers import ( add_input, get_input, setup_logging, @@ -26,9 +26,9 @@ def _bitmap_wrapper_foreign_simple(width, height, format, *args, **kwargs): - if format == pdfium_r.FPDFBitmap_BGRx: + if format == pdfium_c.FPDFBitmap_BGRx: use_alpha = False - elif format == pdfium_r.FPDFBitmap_BGRA: + elif format == pdfium_c.FPDFBitmap_BGRA: use_alpha = True else: raise RuntimeError(f"Cannot create foreign_simple bitmap with bitmap type {pdfium_i.BitmapTypeToStr[format]}.") @@ -260,7 +260,7 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images) dst_image = dst_image.filter(cls._get_linv_lut()) if exclude_images: # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates - image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) + image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1)) if len(image_objs) > 0: mask = PIL.Image.new("1", 
src_image.size) draw = PIL.ImageDraw.Draw(mask) @@ -290,7 +290,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): if invert_lightness: - if bitmap.format == pdfium_r.FPDFBitmap_Gray: + if bitmap.format == pdfium_c.FPDFBitmap_Gray: dst_image = ~src_image else: @@ -308,9 +308,9 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): dst_image = cv2.cvtColor(dst_image, convert_from) if exclude_images: - assert bitmap.format != pdfium_r.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2" + assert bitmap.format != pdfium_c.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2" posconv = bitmap.get_posconv(page) - image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1)) + image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1)) if len(image_objs) > 0: mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8) for obj in image_objs: diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index c6c75fa0c..a67c3c6d3 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -144,7 +144,7 @@ def new_foreign(cls, width, height, format, rev_byteorder=False, force_packed=Fa Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by PDFium. There may be a padding of unused bytes at line end, unless *force_packed=True* is given. - Note that is encouraged to prefer :meth:`.new_native`. + Note that it is recommended to prefer :meth:`.new_native`. """ stride = width * pdfium_i.BitmapTypeToNChannels[format] if force_packed else 0 raw = pdfium_c.FPDFBitmap_CreateEx(width, height, format, None, stride) @@ -156,7 +156,7 @@ def new_foreign_simple(cls, width, height, use_alpha, rev_byteorder=False): """ Create a new bitmap using :func:`FPDFBitmap_Create`. The buffer is allocated by PDFium, and supposed to be packed (i. e. 
no gap of unused bytes between lines). - Note that it is encouraged to prefer :meth:`.new_native`. + Note that it is recommended to prefer :meth:`.new_native`. """ raw = pdfium_c.FPDFBitmap_Create(width, height, use_alpha) return cls.from_raw(raw, rev_byteorder) From b495a1ff65cf7de53070596382c764a2c7c6624f Mon Sep 17 00:00:00 2001 From: geisserml Date: Sat, 13 Jul 2024 22:10:26 +0200 Subject: [PATCH 108/140] add task --- src/pypdfium2/_cli/render.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 9df642ae2..2db39f791 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -259,7 +259,8 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images) else: dst_image = dst_image.filter(cls._get_linv_lut()) if exclude_images: - # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates + # FIXME Not sure how to translate XObject to page coordinates. pdfium does not seem to provide an API for this, so we're currently unable to descend into XObjects. + # FIXME We'd also like to take into account alpha masks, but this may be difficult as long as pdfium does not expose them directly. 
image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1)) if len(image_objs) > 0: mask = PIL.Image.new("1", src_image.size) From e45150abe8319f86ca1b3af705b86354ae866e91 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 14 Jul 2024 14:14:56 +0200 Subject: [PATCH 109/140] Update some wordings --- README.md | 37 +++++++++++++-------------------- docs/devel/changelog_staging.md | 4 ++-- src/pypdfium2/_cli/render.py | 4 ++-- src/pypdfium2/_utils.py | 2 +- src/pypdfium2/version.py | 6 +++--- 5 files changed, 23 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 0b6916c37..2c9f4b1f5 100644 --- a/README.md +++ b/README.md @@ -98,8 +98,6 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct See [Setup Magic](#setup-magic) for details. - Support for source installs (esp. with self-built/system pdfium) is limited, as their integrity somewhat depends on a correctly acting caller. - Installing an `sdist` does not implicitly trigger a sourcebuild if no pre-built binary is available. We prefer to let callers decide consciously what to do, and run the build script without pip encapsulation. Relevant pip options: @@ -107,6 +105,8 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct * `-e`: Install in editable mode, so the installation points to the source tree. This way, changes directly take effect without needing to re-install. Recommended for development. * `--no-build-isolation`: Do not isolate setup in a virtual env; use the main env instead. This renders `pyproject.toml [build-system]` inactive, so setup deps must be prepared by caller. Useful to install custom versions of setup deps, or as speedup when installing repeatedly. + That said, do not expect us to provide much guidance with source installs, or to support the result, as this may be a crafty process, and we can't be sure whether it was done correctly (e.g. ABI safety, ctypesgen version used, etc.). 
+ [^pdfium_buildsystem]: This means pdfium may not compile on arbitrary hosts. The script is limited to build hosts supported by Google's toolchain. Ideally, we'd need an alternative build system that runs with system packages instead. @@ -180,7 +180,7 @@ As of this writing, pypdfium2 does not require any mandatory runtime dependencie However, some optional support model features need additional packages: * [`Pillow`](https://pillow.readthedocs.io/en/stable/) (module `PIL`) is a pouplar imaging library for Python. pypdfium2 provides convenience adapters to translate between raw bitmap buffers and PIL images. It also uses PIL for some command-line functionality (e.g. image saving). -* [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. Similar to `Pillow`, pypdfium2 provides helpers to get a numpy array view of a raw bitmap. +* [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. As with `Pillow`, pypdfium2 provides helpers to get a numpy array view of a raw bitmap. * [`opencv-python`](https://github.com/opencv/opencv-python) (module `cv2`) is an imaging library built around numpy arrays. It can be used in the rendering CLI to save with pypdfium2's numpy adapter. pypdfium2 tries to defer imports of optional dependencies until they are actually needed, so there should be no startup overhead if you don't use them. @@ -649,7 +649,7 @@ Usage should be largely self-explanatory, assuming a minimum of familiarity with ## Licensing -PDFium and pypdfium2 are available by the terms and conditions of either [`Apache-2.0`](LICENSES/Apache-2.0.txt) or [`BSD-3-Clause`](LICENSES/BSD-3-Clause.txt), at your choice. +pypdfium2 is available by the terms and conditions of either [`Apache-2.0`](LICENSES/Apache-2.0.txt) or [`BSD-3-Clause`](LICENSES/BSD-3-Clause.txt), at your choice. Various other open-source licenses apply to dependencies bundled with PDFium. 
Verbatim copies of their respective licenses are contained in the file [`LicenseRef-PdfiumThirdParty.txt`](LICENSES/LicenseRef-PdfiumThirdParty.txt), which also has to be shipped with binary redistributions. Documentation and examples of pypdfium2 are licensed under [`CC-BY-4.0`](LICENSES/CC-BY-4.0.txt). @@ -657,16 +657,13 @@ pypdfium2 complies with the [reuse standard](https://reuse.software/spec/) by in To the author's knowledge, pypdfium2 is one of the rare Python libraries that are capable of PDF rendering while not being covered by copyleft licenses (such as the `GPL`).[^liberal_pdf_renderlibs] -As of early 2023, a single developer is author and rightsholder of the code base (apart from a few minor [code contributions](https://github.com/pypdfium2-team/pypdfium2/graphs/contributors)). - [^liberal_pdf_renderlibs]: The only other liberal-licensed PDF rendering libraries known to the author are [`pdf.js`](https://github.com/mozilla/pdf.js/) (JavaScript) and [`Apache PDFBox`](https://github.com/apache/pdfbox) (Java), but python bindings packages don't exist yet or are unsatisfactory. However, we wrote some gists that show it'd be possible in principle: [pdfbox](https://gist.github.com/mara004/51c3216a9eabd3dcbc78a86d877a61dc) (+ [setup](https://gist.github.com/mara004/881d0c5a99b8444fd5d1d21a333b70f8)), [pdfjs](https://gist.github.com/mara004/87276da4f8be31c80c38036c6ab667d7). -## Issues +## Issues / Contributions While using pypdfium2, you might encounter bugs or missing features. -In this case, feel free to open an issue or discuss thread. If applicable, include details such as tracebacks, OS and CPU type, as well as the versions of pypdfium2 and used dependencies. -__However, please note our [response policy](#contributions).__ +In this case, feel free to open an issue or discussion thread. If applicable, include details such as tracebacks, OS and CPU type, as well as the versions of pypdfium2 and used dependencies. 
Roadmap: * pypdfium2 @@ -679,6 +676,13 @@ Roadmap: * [pdfium-binaries](https://github.com/bblanchon/pdfium-binaries/issues): Binary builder. * [ctypesgen](https://github.com/ctypesgen/ctypesgen/issues): Bindings generator. +### Response policy + + +Given this is a volunteer open-source project, it is possible you may not get a response to your issue, or it may be closed without much feedback. Conversations may be locked if we feel like our attention is getting DDOSed. We may not have time to provide usage support. + +The same applies to Pull Requests. We will accept contributions only if we find them suitable. Do not reach out with a strong expectation to get your change merged; it is solely up to the repository owner to decide if and when a PR will be merged, and we are free to silently reject PRs we do not like. + ### Known limitations #### Risk of unknown object lifetime violations @@ -704,17 +708,6 @@ Also, while ABI bindings tend to be more convenient, they have some technical dr ## Development -### Contributions - - -> We may accept contributions, but only if our code quality expectations are met. - -__Policy__: -* We may not respond to your issue or PR. -* We may close an issue or PR without much feedback. -* We may lock discussions or contributions if our attention is getting DDOSed. -* We may not provide much usage support. - ### Long lines The pypdfium2 codebase does not hard wrap long lines. @@ -877,7 +870,7 @@ Inspired by *wowpng*, the first known proof of concept Python binding to PDFium *pypdfium-reboot* then added a script to automate binary deployment and bindings generation to simplify regular updates. However, it was still not platform specific. pypdfium2 is a full rewrite of *pypdfium-reboot* to build platform-specific wheels and consolidate the setup scripts. Further additions include ... 
-* A CI workflow to automatically release new wheels every Tuesday -* Support models that conveniently wrap the raw PDFium/ctypes API +* A CI workflow to automatically release new wheels on a defined schedule +* Convenience support models that wrap the raw PDFium/ctypes API * Test code * A script to build PDFium from source diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 7cd089d6c..89053d6e6 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -15,7 +15,7 @@ * Renamed `PdfObject.get_pos()` to `.get_bounds()`. * Renamed `PdfImage.get_size()` to `.get_px_size()`. * `PdfImage.extract()`: Removed `fb_render` param because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place. -- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest. +- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting into `n_kids` and `is_closed`. Also distinguishes between `dest is None` and a dest with unknown mode. - `get_text_range()`: Removed implicit translation of default calls to `get_text_bounded()`, as pdfium reverted `FPDFText_GetText()` to UCS-2, which resolves the allocation concern. However, callers are encouraged to explicitly use `get_text_bounded()` for full Unicode support. - Removed legacy version flags.
@@ -27,7 +27,7 @@ - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype. - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`. - Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them. -- Simplified version impl (no API change expected). +- Simplified version classes (no API change expected). *Project* - Merged `tests_old/` back into `tests/`. diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 2db39f791..b87233f2a 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -259,8 +259,8 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images) else: dst_image = dst_image.filter(cls._get_linv_lut()) if exclude_images: - # FIXME Not sure how to translate XObject to page coordinates. pdfium does not seem to provide an API for this, so we're currently unable to descend into XObjects. - # FIXME We'd also like to take into account alpha masks, but this may be difficult as long as pdfium does not expose them directly. + # FIXME pdfium does not seem to provide APIs to translate XObject to page coordinates, so not sure how to handle images nested in XObjects. + # FIXME we'd also like to take alpha masks into account, but this may be difficult as long as pdfium does not expose them directly. image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1)) if len(image_objs) > 0: mask = PIL.Image.new("1", src_image.size) diff --git a/src/pypdfium2/_utils.py b/src/pypdfium2/_utils.py index a1c459281..d968f1c7f 100644 --- a/src/pypdfium2/_utils.py +++ b/src/pypdfium2/_utils.py @@ -7,7 +7,7 @@ def deferred_import(modpath): - # FIXME If modpath points to a submodule, the parent module will be loaded immediately when this function is called. 
This is a limitation of the find_spec() importlib API used here. However, this may still be useful if the parent is a mere namespace package that does not contain anything expensive, as in the case of PIL. + # FIXME If modpath points to a submodule, the parent module will be loaded immediately when this function is called, which is a limitation of the find_spec() importlib API used here. However, this may still be useful if the parent is a mere namespace package that does not contain anything expensive, as in the case of PIL. module = sys.modules.get(modpath, None) if module is not None: diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py index 78006c11b..43550d423 100644 --- a/src/pypdfium2/version.py +++ b/src/pypdfium2/version.py @@ -10,7 +10,7 @@ import pypdfium2_raw -class _abc_version: +class _version_interface: def __init__(self): @@ -45,7 +45,7 @@ def _craft_desc(self, suffix=[]): return desc -class _version_pypdfium2 (_abc_version): +class _version_pypdfium2 (_version_interface): _FILE = Path(__file__).parent / "version.json" _TAG_FIELDS = ("major", "minor", "patch") @@ -64,7 +64,7 @@ def _hook(self): self.desc += "@editable" -class _version_pdfium (_abc_version): +class _version_pdfium (_version_interface): _FILE = Path(pypdfium2_raw.__file__).parent / "version.json" _TAG_FIELDS = ("major", "minor", "build", "patch") From d3e9a43da4e2542bd66923f262cd0c6824201657 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 16 Jul 2024 14:18:30 +0200 Subject: [PATCH 110/140] readme: slightly update wording in raw api guide --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 2c9f4b1f5..09aae76de 100644 --- a/README.md +++ b/README.md @@ -332,14 +332,14 @@ Here are some examples of using the support model API. ### Raw PDFium API -While helper classes conveniently wrap the raw PDFium API, it may still be accessed directly and is available in the namespace `pypdfium2.raw`. 
Lower-level helpers that may aid with using the raw API are provided in `pypdfium2.internal`. +While helper classes conveniently wrap the raw PDFium API, it may still be accessed directly and is available in the namespace `pypdfium2.raw`. Lower-level utilities that may aid with using the raw API are provided in `pypdfium2.internal`. ```python import pypdfium2.raw as pdfium_c import pypdfium2.internal as pdfium_i ``` -Since PDFium is a large library, many components are not covered by helpers yet. You may seamlessly interact with the raw API while still using helpers where available. When used as ctypes function parameter, helper objects automatically resolve to the underlying raw object (but you may still access it explicitly if desired): +Since PDFium is a large library, many components are not covered by helpers yet. However, as helpers expose their underlying raw objects, you may seamlessly integrate raw APIs while using helpers as available. When passed as ctypes function parameter, helpers automatically resolve to the raw object handle (but you may still access it explicitly if desired): ```python permission_flags = pdfium_c.FPDF_GetDocPermission(pdf.raw) # explicit permission_flags = pdfium_c.FPDF_GetDocPermission(pdf) # implicit @@ -347,14 +347,14 @@ permission_flags = pdfium_c.FPDF_GetDocPermission(pdf) # implicit For PDFium docs, please look at the comments in its [public header files](https://pdfium.googlesource.com/pdfium/+/refs/heads/main/public/).[^pdfium_docs] A large variety of examples on how to interface with the raw API using [`ctypes`](https://docs.python.org/3/library/ctypes.html) is already provided with [support model source code](src/pypdfium2/_helpers). -Nonetheless, the following guide may be helpful to get started with the raw API, especially for developers who are not familiar with `ctypes` yet. +Nonetheless, the following guide may be helpful to get started with the raw API, if you are not familiar with `ctypes` yet. 
[^pdfium_docs]: Unfortunately, no recent HTML-rendered docs are available for PDFium at the moment. * In general, PDFium functions can be called just like normal Python functions. - However, parameters may only be passed positionally, i. e. it is not possible to use keyword arguments. + However, parameters may only be passed positionally, i.e. it is not possible to use keyword arguments. There are no defaults, so you always need to provide a value for each argument. ```python # arguments: filepath (bytes), password (bytes|None) @@ -369,12 +369,12 @@ Nonetheless, the following guide may be helpful to get started with the raw API, FPDF_LoadDocument.argtypes = [FPDF_STRING, FPDF_BYTESTRING] FPDF_LoadDocument.restype = FPDF_DOCUMENT ``` - Python `bytes` are converted to `FPDF_STRING` by ctypes autoconversion. + Python `bytes` are converted to `FPDF_STRING` (which is an alias to `POINTER(c_char)`, rps. `char*` in C notation) by ctypes autoconversion. When passing a string to a C function, it must always be null-terminated, as the function merely receives a pointer to the first item and then continues to read memory until it finds a null terminator. [^bindings_decl]: From the auto-generated bindings file. We maintain a reference copy at `autorelease/bindings.py`. Or if you have an editable install, there will also be `src/pypdfium2_raw/bindings.py`. -* While some functions are quite easy to use, things soon get more complex. +* While some functions are quite easy to use, things may soon get more peculiar. First of all, function parameters are not only used for input, but also for output: ```python # Initialise an integer object (defaults to 0) @@ -406,7 +406,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API, ``` * For string output parameters, callers needs to provide a sufficiently long, pre-allocated buffer. 
- This may work differently depending on what type the function requires, which encoding is used, whether the number of bytes or characters is returned, and whether space for a null terminator is included or not. Carefully review the documentation for the function in question to fulfill its requirements. + This may work differently depending on what type the function requires, which encoding is used, whether the number of bytes or characters is returned, and whether space for a null terminator is included or not. Carefully review the documentation of the function in question to fulfill its requirements. Example A: Getting the title string of a bookmark. ```python @@ -446,8 +446,8 @@ Nonetheless, the following guide may be helpful to get started with the raw API, * Not only are there different ways of string output that need to be handled according to the requirements of the function in question. String input, too, can work differently depending on encoding and type. - We have already discussed `FPDF_LoadDocument()`, which takes a UTF-8 encoded string as `char *`. - A different examples is `FPDFText_FindStart()`, which needs a UTF-16LE encoded string, given as `unsigned short *`: + We have already discussed `FPDF_LoadDocument()`, which takes a UTF-8 encoded string as `char*`. + A different example is `FPDFText_FindStart()`, which needs a UTF-16LE encoded string, given as `unsigned short*`: ```python # (Assuming `text` is a str and `textpage` an FPDF_TEXTPAGE) # Add the null terminator and encode as UTF-16LE @@ -459,7 +459,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API, * Leaving strings, let's suppose you have a C memory buffer allocated by PDFium and wish to read its data. PDFium will provide you with a pointer to the first item of the byte array.
- To access the data, you'll want to re-interpret the pointer using `ctypes.cast()` to encompass the whole array: + To access the data, you'll want to re-interpret the pointer with `ctypes.cast()` to encompass the whole array: ```python # (Assuming `bitmap` is an FPDF_BITMAP and `size` is the expected number of bytes in the buffer) buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(bitmap) @@ -480,7 +480,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API, n_bytes = py_buffer.readinto(buffer_ptr.contents) # returns the number of bytes read ``` -* If you wish to check whether two objects returned by PDFium are the same, the `is` operator won't help because `ctypes` does not have original object return (OOR), i. e. new, equivalent Python objects are created each time, although they might represent one and the same C object.[^ctypes_no_oor] +* If you wish to check whether two objects returned by PDFium are the same, the `is` operator won't help because `ctypes` does not have original object return (OOR), i.e. new, equivalent Python objects are created each time, although they might represent one and the same C object.[^ctypes_no_oor] That's why you'll want to use `ctypes.addressof()` to get the memory addresses of the underlying C object. For instance, this is used to avoid infinite loops on circular bookmark references when iterating through the document outline: ```python @@ -504,7 +504,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API, [^callback_usecases]: e. g. incremental read/write, management of progressive tasks, ... - Example: Loading a document from a Python buffer. This way, file access can be controlled in Python while the whole data does not need to be in memory at once. + Example: Loading a document from a Python buffer. This way, file access can be controlled in Python while the data does not need to be in memory at once. 
```python import os @@ -542,7 +542,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API, * When using the raw API, special care needs to be taken regarding object lifetime, considering that Python may garbage collect objects as soon as their reference count reaches zero. However, the interpreter has no way of magically knowing how long the underlying resources of a Python object might still be needed on the C side, so measures need to be taken to keep such objects referenced until PDFium does not depend on them anymore. - If resources need to remain valid after the time of a function call, PDFium docs usually indicate this clearly. Ignoring requirements on object lifetime will lead to memory corruption (commonly resulting in a segfault). + If resources need to remain valid after the time of a function call, PDFium docs usually indicate this clearly. Ignoring requirements on object lifetime will lead to memory corruption (commonly resulting in a segfault sooner or later). For instance, the docs on `FPDF_LoadCustomDocument()` state that > The application must keep the file resources |pFileAccess| points to valid until the returned FPDF_DOCUMENT is closed. |pFileAccess| itself does not need to outlive the FPDF_DOCUMENT. From 86bc8b19173eb179a60d8b9d5a85bf5dbb5d4b8d Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 16 Jul 2024 14:38:52 +0200 Subject: [PATCH 111/140] Add reference to VikParuchuri's `pdftext` --- src/pypdfium2/_helpers/textpage.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py index 19cdb4ac6..64e4b33cb 100644 --- a/src/pypdfium2/_helpers/textpage.py +++ b/src/pypdfium2/_helpers/textpage.py @@ -19,6 +19,11 @@ class PdfTextPage (pdfium_i.AutoCloseable): """ Text page helper class. + Hint: + (py)pdfium itself does not implement layout analysis, such as detecting words/lines/paragraphs. 
+ However, there is a fancy third-party extension to pypdfium2 that fills this gap: + https://github.com/VikParuchuri/pdftext + Attributes: raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle. From f0dbf9c0bf5c729ccab7852227149e53e77d2691 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 21 Jul 2024 14:02:16 +0200 Subject: [PATCH 112/140] version: clean up trailer --- src/pypdfium2/version.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py index 43550d423..860aae695 100644 --- a/src/pypdfium2/version.py +++ b/src/pypdfium2/version.py @@ -165,5 +165,3 @@ def _hook(self): flags (tuple[str]): Tuple of pdfium feature flags. Empty for default build. (V8, XFA) for pdfium-binaries V8 build. """ - -# ----- From f33fa366ecf6792bbe969f2bafeed580fee7e64b Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 21 Jul 2024 18:10:55 +0200 Subject: [PATCH 113/140] readme: improve raw api Avoid declaring UTF-16 as "2 bytes per character", because a visual character could be composed of a surrogate pair of 4 bytes. So this is not the number of visual characters, but the number of units, where the number of bytes per unit corresponds to the size of the data type used. --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 09aae76de..0af837329 100644 --- a/README.md +++ b/README.md @@ -369,7 +369,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API, FPDF_LoadDocument.argtypes = [FPDF_STRING, FPDF_BYTESTRING] FPDF_LoadDocument.restype = FPDF_DOCUMENT ``` - Python `bytes` are converted to `FPDF_STRING` (which is an alias to `POINTER(c_char)`, rps. `char*` in C notation) by ctypes autoconversion. + Python `bytes` are converted to `FPDF_STRING` by ctypes autoconversion. This works because `FPDF_STRING` is actually an alias to `POINTER(c_char)` (i.e. `char*`), which is a primitive pointer type. 
When passing a string to a C function, it must always be null-terminated, as the function merely receives a pointer to the first item and then continues to read memory until it finds a null terminator. [^bindings_decl]: From the auto-generated bindings file. We maintain a reference copy at `autorelease/bindings.py`. Or if you have an editable install, there will also be `src/pypdfium2_raw/bindings.py`. @@ -411,14 +411,13 @@ Nonetheless, the following guide may be helpful to get started with the raw API, Example A: Getting the title string of a bookmark. ```python # (Assuming `bookmark` is an FPDF_BOOKMARK) - # First call to get the required number of bytes (not characters!), including space for a null terminator + # First call to get the required number of bytes (not units!), including space for a null terminator n_bytes = pdfium_c.FPDFBookmark_GetTitle(bookmark, None, 0) # Initialise the output buffer buffer = ctypes.create_string_buffer(n_bytes) # Second call with the actual buffer pdfium_c.FPDFBookmark_GetTitle(bookmark, buffer, n_bytes) - # Decode to string, cutting off the null terminator - # Encoding: UTF-16LE (2 bytes per character) + # Decode to string, cutting off the null terminator (encoding: UTF-16LE) title = buffer.raw[:n_bytes-2].decode("utf-16-le") ``` @@ -427,16 +426,17 @@ Nonetheless, the following guide may be helpful to get started with the raw API, # (Assuming `textpage` is an FPDF_TEXTPAGE and the boundary variables are set) # Store common arguments for the two calls args = (textpage, left, top, right, bottom) - # First call to get the required number of characters (not bytes!) - a possible null terminator is not included + # First call to get the required number of units (not bytes!) 
- a possible null terminator is not included n_chars = pdfium_c.FPDFText_GetBoundedText(*args, None, 0) # If no characters were found, return an empty string if n_chars <= 0: return "" - # Calculate the required number of bytes (UTF-16LE encoding again) + # Calculate the required number of bytes (encoding: UTF-16LE again) + # The function signature uses c_ushort, so 1 unit takes sizeof(c_ushort) == 2 bytes n_bytes = 2 * n_chars # Initialise the output buffer - this function can work without null terminator, so skip it buffer = ctypes.create_string_buffer(n_bytes) - # Re-interpret the type from char to unsigned short as required by the function + # Re-interpret the type from char to unsigned short* as required by the function buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort)) # Second call with the actual buffer pdfium_c.FPDFText_GetBoundedText(*args, buffer_ptr, n_chars) From c2aa668d5252ea911b4350d5bb81bae809fd1325 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 21 Jul 2024 18:32:34 +0200 Subject: [PATCH 114/140] Update a few docstrings --- src/pypdfium2/_helpers/document.py | 12 ++++++------ src/pypdfium2/_helpers/page.py | 2 +- src/pypdfium2/_helpers/textpage.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index abff23207..6511c2f96 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -317,7 +317,7 @@ def count_attachments(self): def get_attachment(self, index): """ Returns: - PdfAttachment: The attachment at *index* (zero-based). + PdfAttachment: The attachment at given index (zero-based). """ raw_attachment = pdfium_c.FPDFDoc_GetAttachment(self, index) if not raw_attachment: @@ -345,7 +345,7 @@ def new_attachment(self, name): def del_attachment(self, index): """ - Unlink the attachment at *index* (zero-based). + Unlink the attachment at given index (zero-based). 
It will be hidden from the viewer, but is still present in the file (as of PDFium 5418). Following attachments shift one slot to the left in the array representation used by PDFium's API. @@ -360,7 +360,7 @@ def del_attachment(self, index): def get_page(self, index): """ Returns: - PdfPage: The page at *index* (zero-based). + PdfPage: The page at given index (zero-based). Note: This calls ``FORM_OnAfterLoadPage()`` if the document has an active form env. In that case, note that closing the formenv would implicitly close the page. @@ -406,7 +406,7 @@ def new_page(self, width, height, index=None): def del_page(self, index): """ - Remove the page at *index* (zero-based). + Remove the page at given index (zero-based). It is recommended to close any open handles to the page before calling this method. """ # FIXME not sure how pdfium would behave if the caller tries to access a handle to a deleted page... @@ -447,7 +447,7 @@ def import_pages(self, pdf, pages=None, index=None): def get_page_size(self, index): """ Returns: - (float, float): Width and height in PDF canvas units of the page at *index* (zero-based). + (float, float): Width and height of the page at given index (zero-based), in PDF canvas units. """ size = pdfium_c.FS_SIZEF() ok = pdfium_c.FPDF_GetPageSizeByIndexF(self, index, size) @@ -459,7 +459,7 @@ def get_page_size(self, index): def get_page_label(self, index): """ Returns: - str: Label of the page at *index* (zero-based). + str: Label of the page at given index (zero-based). (A page label is essentially an alias that may be displayed instead of the page number.) """ n_bytes = pdfium_c.FPDF_GetPageLabel(self, index, None, 0) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index f992c723c..59342af44 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -202,7 +202,7 @@ def insert_obj(self, pageobj): """ Insert a pageobject into the page. - The pageobject must not belong to a page yet. 
If it belongs to a PDF, this page must be part of the PDF. + The pageobject must not belong to a page yet. If it belongs to a PDF, the target page must be part of that PDF. Position and form are defined by the object's matrix. If it is the identity matrix, the object will appear as-is on the bottom left corner of the page. diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py index 64e4b33cb..22dbc36a8 100644 --- a/src/pypdfium2/_helpers/textpage.py +++ b/src/pypdfium2/_helpers/textpage.py @@ -21,7 +21,7 @@ class PdfTextPage (pdfium_i.AutoCloseable): Hint: (py)pdfium itself does not implement layout analysis, such as detecting words/lines/paragraphs. - However, there is a fancy third-party extension to pypdfium2 that fills this gap: + However, there is a fancy third-party extension that fills this gap: https://github.com/VikParuchuri/pdftext Attributes: @@ -43,7 +43,7 @@ def parent(self): # AutoCloseable hook def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"): """ - Extract text from given boundaries in PDF coordinates. + Extract text from given boundaries, in PDF canvas units. If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`. 
Parameters: From eb8b1b523545b61fdd1bf1ec5adc33fbb2eb59c9 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 21 Jul 2024 19:02:19 +0200 Subject: [PATCH 115/140] Rename "byte buffer" to "byte stream" --- src/pypdfium2/_helpers/document.py | 8 ++++---- src/pypdfium2/_helpers/pageobjects.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index 6511c2f96..a7b775e57 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -24,13 +24,13 @@ class PdfDocument (pdfium_i.AutoCloseable): Parameters: input_data (str | pathlib.Path | bytes | ctypes.Array | typing.BinaryIO | FPDF_DOCUMENT): - The input PDF given as file path, bytes, ctypes array, byte buffer, or raw PDFium document handle. - A byte buffer is defined as an object that implements ``seek() tell() read() readinto()``. + The input PDF given as file path, bytes, ctypes array, byte stream, or raw PDFium document handle. + A byte stream is defined as an object that implements ``seek() tell() read() readinto()``. password (str | None): A password to unlock the PDF, if encrypted. Otherwise, None or an empty string may be passed. If a password is given but the PDF is not encrypted, it will be ignored (as of PDFium 5418). autoclose (bool): - Whether byte buffer input should be automatically closed on finalization. + Whether byte stream input should be automatically closed on finalization. Raises: PdfiumError: Raised if the document failed to load. The exception is annotated with the reason reported by PDFium (via message and :attr:`~.PdfiumError.err_code`). @@ -219,7 +219,7 @@ def save(self, dest, version=None, flags=pdfium_c.FPDF_NO_INCREMENTAL): Parameters: dest (str | pathlib.Path | io.BytesIO): - File path or byte buffer the document shall be written to. + File path or byte stream the document shall be written to. 
version (int | None): The PDF version to use, given as an integer (14 for 1.4, 15 for 1.5, ...). If None (the default), PDFium will set a version automatically. diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 084babb60..01bf7768b 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -212,7 +212,7 @@ def load_jpeg(self, source, pages=None, inline=False, autoclose=True): Parameters: source (str | pathlib.Path | typing.BinaryIO): - Input JPEG, given as file path or readable byte buffer. + Input JPEG, given as file path or readable byte stream. pages (list[PdfPage] | None): If replacing an image, pass in a list of loaded pages that might contain it, to update their cache. (The same image may be shown multiple times in different transforms across a PDF.) @@ -230,7 +230,7 @@ def load_jpeg(self, source, pages=None, inline=False, autoclose=True): elif pdfium_i.is_buffer(source, "r"): buffer = source else: - raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte buffer.") + raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte stream.") bufaccess, to_hold = pdfium_i.get_bufreader(buffer) loader = { @@ -341,7 +341,7 @@ def get_filters(self, skip_simple=False): def extract(self, dest, *args, **kwargs): """ - Extract the image into an independently usable file or byte buffer, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits. + Extract the image into an independently usable file or byte stream, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits. This method can only extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly. Otherwise, the pixel data is decoded and re-encoded using :mod:`PIL`, which is slower and loses the original encoding. 
@@ -355,7 +355,7 @@ def extract(self, dest, *args, **kwargs): Parameters: dest (str | pathlib.Path | io.BytesIO): - File path prefix or byte buffer to which the image shall be written. + File path prefix or byte stream to which the image shall be written. fb_format (str): The image format to use in case it is necessary to (re-)encode the data. """ From d29435db98d2ea049688d81eea0c23f155b7060d Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 24 Jul 2024 17:25:05 +0200 Subject: [PATCH 116/140] doc nits The TODO is pointless because closing a pageobject that is part of a page would be a no-op, these are managed by pdfium anyway. --- src/pypdfium2/_helpers/bitmap.py | 2 +- src/pypdfium2/_helpers/document.py | 6 +++--- src/pypdfium2/_helpers/page.py | 2 -- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index a67c3c6d3..10e5389e1 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -23,7 +23,7 @@ class PdfBitmap (pdfium_i.AutoCloseable): Warning: ``bitmap.close()``, which frees the buffer of foreign bitmaps, is not validated for safety. - A bitmap must not be closed when other objects still depend on its buffer! + A bitmap must not be closed while other objects still depend on its buffer! Attributes: raw (FPDF_BITMAP): diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py index a7b775e57..4805f7a05 100644 --- a/src/pypdfium2/_helpers/document.py +++ b/src/pypdfium2/_helpers/document.py @@ -38,7 +38,7 @@ class PdfDocument (pdfium_i.AutoCloseable): Hint: * Documents may be used in a ``with``-block, closing the document on context manager exit. - This is recommended when *input_data* is a file path, to safely and immediately release the opened file handle. + This is recommended when *input_data* is a file path, to safely and immediately release the bound file handle. 
* :func:`len` may be called to get a document's number of pages. * Pages may be loaded using list index access. * Looping over a document will yield its pages from beginning to end. @@ -608,8 +608,8 @@ def as_pageobject(self): """ Returns: PdfObject: An independent pageobject representation of the XObject. - If multiple pageobjects are created from one XObject, they share resources. - Pageobjects created from an XObject remain valid after the XObject is closed. + If multiple pageobjects are created from an XObject, they share resources. + Returned pageobjects remain valid after the XObject is closed. """ raw_pageobj = pdfium_c.FPDF_NewFormObjectFromXObject(self) # not a child object (see above) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index 59342af44..cd451a67d 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -281,8 +281,6 @@ def get_objects(self, filter=None, max_depth=15, form=None, level=0): :class:`.PdfObject`: A pageobject. """ - # TODO close skipped objects explicitly ? - if form: count_objects = pdfium_c.FPDFFormObj_CountObjects get_object = pdfium_c.FPDFFormObj_GetObject From f15ac1b5b80a648cfff8cfbff29805c0998b9706 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 25 Jul 2024 14:09:49 +0200 Subject: [PATCH 117/140] fix typo --- docs/devel/changelog_staging.md | 2 +- src/pypdfium2/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 89053d6e6..6e2352600 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -10,7 +10,7 @@ * Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). Instead, use `PdfPage.render()` with a loop or process pool. * Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`. * `PdfBitmap.from_pil()`: Removed `recopy` param. 
- * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark them" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion, as is now implemented in pypdfium2's rendering CLI. + * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark theme" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion, as is now implemented in pypdfium2's rendering CLI. - Pageobjects * Renamed `PdfObject.get_pos()` to `.get_bounds()`. * Renamed `PdfImage.get_size()` to `.get_px_size()`. 
diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py index 860aae695..ec280f8dc 100644 --- a/src/pypdfium2/version.py +++ b/src/pypdfium2/version.py @@ -55,7 +55,7 @@ def _hook(self): self.tag = self._craft_tag() if self.beta is not None: self.tag += f"b{self.beta}" - + suffix = ["dirty"] if self.dirty else [] self.desc = self._craft_desc(suffix) if self.data_source != "git": From 4cda54c8c7d443bf1343832fc33bd9784f8a2056 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 1 Aug 2024 17:54:29 +0200 Subject: [PATCH 118/140] Update to new FPDFPageObj_TransformF() https://pdfium-review.googlesource.com/c/pdfium/+/121630 --- src/pypdfium2/_helpers/pageobjects.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 01bf7768b..7565b1598 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -145,7 +145,9 @@ def transform(self, matrix): Parameters: matrix (PdfMatrix): Multiply the pageobject's current transform matrix by this matrix. 
""" - pdfium_c.FPDFPageObj_Transform(self, *matrix.get()) + ok = pdfium_c.FPDFPageObj_TransformF(self, matrix) + if not ok: + raise PdfiumError("Failed to transform pageobject with matrix.") class PdfImage (PdfObject): From 7cc3cbe2b16961976879b344126448f0f2d63253 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 1 Aug 2024 18:11:48 +0200 Subject: [PATCH 119/140] Fix caller-side imports of deferred modules --- src/pypdfium2/_utils.py | 49 +++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/src/pypdfium2/_utils.py b/src/pypdfium2/_utils.py index d968f1c7f..48dd1099c 100644 --- a/src/pypdfium2/_utils.py +++ b/src/pypdfium2/_utils.py @@ -1,32 +1,39 @@ # SPDX-FileCopyrightText: 2024 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +# see https://gist.github.com/mara004/6915e904797916b961e9c53b4fc874ec for alternative approaches to deferred imports + import sys -import importlib.util +import importlib +import functools +if sys.version_info < (3, 8): + # NOTE This is not as good as a real cached property. + # https://github.com/penguinolog/backports.cached_property might be better. + def cached_property(func): + return property( functools.lru_cache(maxsize=1)(func) ) +else: + cached_property = functools.cached_property -def deferred_import(modpath): + +class _DeferredModule: - # FIXME If modpath points to a submodule, the parent module will be loaded immediately when this function is called, which is a limitation of the find_spec() importlib API used here. However, this may still be useful if the parent is a mere namespace package that does not contain anything expensive, as in the case of PIL. + # NOTE Attribute assigment will affect only the wrapper, not the actual module. 
- module = sys.modules.get(modpath, None) - if module is not None: - return module # shortcut + def __init__(self, modpath): + self._modpath = modpath - # assuming an optional dependency - # returning None will simply let it fail with an AttributeError when attempting to access the module - try: - spec = importlib.util.find_spec(modpath) - except ModuleNotFoundError: - return None - if spec is None: - return None + def __repr__(self): + return f"" - # see https://docs.python.org/3/library/importlib.html#implementing-lazy-imports - loader = importlib.util.LazyLoader(spec.loader) - spec.loader = loader - module = importlib.util.module_from_spec(spec) - sys.modules[modpath] = module - loader.exec_module(module) + @cached_property + def _module(self): + # print("actually importing module...") + return importlib.import_module(self._modpath) - return module + def __getattr__(self, k): + return getattr(self._module, k) + + +def deferred_import(modpath): + return _DeferredModule(modpath) From bbc7f98b01f1306794502c0251fc2960dcf0926e Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 11 Aug 2024 21:26:56 +0200 Subject: [PATCH 120/140] `PdfMatrix.mirror()`: Fix misleading terminology see changelog entry --- docs/devel/changelog_staging.md | 1 + src/pypdfium2/_helpers/matrix.py | 10 ++++++---- tests/test_nup.py | 6 +++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 6e2352600..16b5b14d4 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -15,6 +15,7 @@ * Renamed `PdfObject.get_pos()` to `.get_bounds()`. * Renamed `PdfImage.get_size()` to `.get_px_size()`. * `PdfImage.extract()`: Removed `fb_render` param because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place. 
+- Renamed misleading `PdfMatrix.mirror()` parameters `v, h` to `invert_x, invert_y`, as the terms horizontal/vertical flip commonly refer to the transformation applied, not the axis around which is being flipped (i.e. the previous `v` meant flipping around the Y axis, which is vertical, but the resulting transform is inverting the X coordinates and thus actually horizontal). No behavior change if you did not use keyword arguments. - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest is None` and a dest with unknown mode. - `get_text_range()`: Removed implicit translation of default calls to `get_text_bounded()`, as pdfium reverted `FPDFText_GetText()` to UCS-2, which resolves the allocation concern. However, callers are encouraged to explicitly use `get_text_bounded()` for full Unicode support. - Removed legacy version flags. diff --git a/src/pypdfium2/_helpers/matrix.py b/src/pypdfium2/_helpers/matrix.py index 9ba1de292..935a5784d 100644 --- a/src/pypdfium2/_helpers/matrix.py +++ b/src/pypdfium2/_helpers/matrix.py @@ -127,13 +127,15 @@ def rotate(self, angle, ccw=False, rad=False): return self.multiply( PdfMatrix(c, s, -s, c) if ccw else PdfMatrix(c, -s, s, c) ) - def mirror(self, v, h): + def mirror(self, invert_x, invert_y): """ Parameters: - v (bool): Whether to mirror vertically (at the Y axis). - h (bool): Whether to mirror horizontally (at the X axis). + invert_x (bool): If True, invert X coordinates (horizontal transform). Corresponds to flipping around the Y axis. + invert_y (bool): If True, invert Y coordinates (vertical transform). Corresponds to flipping around the X axis. 
+ Note: + Flipping around a vertical axis leads to a horizontal transform, and vice versa. """ - return self.scale(x=(-1 if v else 1), y=(-1 if h else 1)) + return self.scale(x=(-1 if invert_x else 1), y=(-1 if invert_y else 1)) def skew(self, x_angle, y_angle, rad=False): diff --git a/tests/test_nup.py b/tests/test_nup.py index 5a20f1b91..270bb2908 100644 --- a/tests/test_nup.py +++ b/tests/test_nup.py @@ -37,21 +37,21 @@ def test_xobject_placement(): assert pytest.approx(pos_a, abs=0.5) == (19, 440, 279, 823) po = xobject.as_pageobject() - matrix = base_matrix.mirror(v=True, h=False).translate(w, 0).translate(w, h) + matrix = base_matrix.mirror(invert_x=True, invert_y=False).translate(w, 0).translate(w, h) assert matrix == pdfium.PdfMatrix(-0.5, 0, 0, 0.5, 2*w, h) po.transform(matrix) dest_page_1.insert_obj(po) po = xobject.as_pageobject() assert po.get_matrix() == pdfium.PdfMatrix() - matrix = base_matrix.mirror(v=False, h=True).translate(0, h).translate(w, 0) + matrix = base_matrix.mirror(invert_x=False, invert_y=True).translate(0, h).translate(w, 0) assert matrix == pdfium.PdfMatrix(0.5, 0, 0, -0.5, w, h) po.set_matrix(matrix) assert po.get_matrix() == matrix dest_page_1.insert_obj(po) po = xobject.as_pageobject() - matrix = base_matrix.mirror(v=True, h=True).translate(w, h) + matrix = base_matrix.mirror(invert_x=True, invert_y=True).translate(w, h) assert matrix == pdfium.PdfMatrix(-0.5, 0, 0, -0.5, w, h) po.set_matrix(matrix) dest_page_1.insert_obj(po) From 98ed5365934096d817ed61b7d63893e8bad43d43 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 11 Aug 2024 21:32:59 +0200 Subject: [PATCH 121/140] changelog: explicitly mention previous `_flatten()` --- docs/devel/changelog_staging.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 16b5b14d4..a59a974ef 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -23,7 +23,7 @@ *Improvements and 
new features* - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates. - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. -- Exposed `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added check and updated docs accordingly. +- Exposed `PdfPage.flatten()` (previously semi-private `_flatten()`), after having found out how to correctly use it. Added check and updated docs accordingly. - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype. - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`. From ee2f03593a90db713d710dd47ed520b211b1c038 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 26 Aug 2024 21:44:46 +0200 Subject: [PATCH 122/140] changelog nit --- docs/devel/changelog_staging.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index a59a974ef..f77d83a7b 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -24,7 +24,7 @@ - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates. - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. - Exposed `PdfPage.flatten()` (previously semi-private `_flatten()`), after having found out how to correctly use it. Added check and updated docs accordingly. 
-- Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. +- Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released explicitly, given OS limits on the number of open FDs. - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype. - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`. - Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them. From b3f78041f0e3f57e5152941a379f7bf6068ced22 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 19 Sep 2024 02:22:46 +0200 Subject: [PATCH 123/140] Update licensing docs It is not clear to me if PDFium is "BSD-3-Clause OR Apache-2.0" or "BSD-3-Clause AND Apache-2.0". The pypdfium2 codebase previously stated "OR", but recently it hit me we don't actually have any evidence for that. In the end, I figured it was probably a presumption from the early days of the project that might as well be wrong, and that "BSD-3-Clause AND Apache-2.0" would have been the safer assumption. Sorry :( IANAL, but to my understanding both licenses are liberal and in similar spirit, so hopefully this should not have negative legal consequences downstream. Note that there is (and always was) ABSOLUTELY NO WARRANTY for any information provided with the pypdfium2 project. For pypdfium2's Readme, see the CC-BY-4.0 license (e.g. "Section 5 -- Disclaimer of Warranties and Limitation of Liability."). 
For pypdfium2's code (including any information provided therein), see the Apache-2.0 or BSD-3-Clause licenses, which have similar disclaimers. This patch avoids any "OR" or "AND", instead changing to a generic comma. This is not valid SPDX/reuse syntax and serves as a placeholder until we know better. Note that pypdfium2's Python code continues to be "Apache-2.0 OR BSD-3-Clause". This issue is only about PDFium itself. --- .reuse/dep5 | 4 ++-- .reuse/dep5-wheel | 2 +- README.md | 12 +++++++++--- conda/helpers/recipe/meta.yaml | 4 ++-- conda/raw/recipe/meta.yaml | 4 ++-- setup.py | 3 +-- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/.reuse/dep5 b/.reuse/dep5 index 552d0b4af..8caad1951 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -51,7 +51,7 @@ Files: tests/resources/attachments.pdf tests/resources/mona_lisa.jpg Copyright: 2022 PDFium developers -License: BSD-3-Clause OR Apache-2.0 +License: BSD-3-Clause, Apache-2.0 Comment: Obtained from: https://pdfium.googlesource.com/pdfium/+/refs/heads/main/testing/resources/bookmarks_circular.pdf @@ -67,7 +67,7 @@ Files: Copyright: 2022 PDFium developers 2024 geisserml -License: BSD-3-Clause OR Apache-2.0 +License: BSD-3-Clause, Apache-2.0 Files: tests/resources/images.pdf Copyright: diff --git a/.reuse/dep5-wheel b/.reuse/dep5-wheel index 5bf3fe889..5b046468b 100644 --- a/.reuse/dep5-wheel +++ b/.reuse/dep5-wheel @@ -26,4 +26,4 @@ Copyright: 2024 PDFium developers 2024 Developers of projects mentioned in PdfiumThirdParty 2024 Benoît Blanchon and pdfium-binaries contributors -License: (Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty +License: (BSD-3-Clause, Apache-2.0) AND LicenseRef-PdfiumThirdParty diff --git a/README.md b/README.md index 0af837329..e92ba80d6 100644 --- a/README.md +++ b/README.md @@ -649,11 +649,17 @@ Usage should be largely self-explanatory, assuming a minimum of familiarity with ## Licensing -pypdfium2 is available by the terms and conditions of either 
[`Apache-2.0`](LICENSES/Apache-2.0.txt) or [`BSD-3-Clause`](LICENSES/BSD-3-Clause.txt), at your choice. -Various other open-source licenses apply to dependencies bundled with PDFium. Verbatim copies of their respective licenses are contained in the file [`LicenseRef-PdfiumThirdParty.txt`](LICENSES/LicenseRef-PdfiumThirdParty.txt), which also has to be shipped with binary redistributions. +*Important: This is NOT LEGAL ADVICE, and there is ABSOLUTELY NO WARRANTY for any information provided in this document or elsewhere in the pypdfium2 project, including earlier revisions.* + +pypdfium2 itself is available by the terms and conditions of [`Apache-2.0`](LICENSES/Apache-2.0.txt) / [`BSD-3-Clause`](LICENSES/BSD-3-Clause.txt). Documentation and examples of pypdfium2 are licensed under [`CC-BY-4.0`](LICENSES/CC-BY-4.0.txt). -pypdfium2 complies with the [reuse standard](https://reuse.software/spec/) by including [SPDX](https://spdx.org/licenses/) headers in source files, and license information for data files in [`.reuse/dep5`](.reuse/dep5). +PDFium is available under a BSD-style license that can be found in its [`LICENSE`](https://pdfium.googlesource.com/pdfium/+/refs/heads/main/LICENSE) file. +Various other open-source licenses apply to dependencies bundled with PDFium. These also have to be shipped alongside binary redistributions. Copies of identified licenses are provided in [`LicenseRef-PdfiumThirdParty.txt`](LICENSES/LicenseRef-PdfiumThirdParty.txt). +There is no guarantee of completeness, and pdfium's dependencies might change over time. Please do notify us if you think this misses a relevant license. + +pypdfium2 includes [SPDX](https://spdx.org/licenses/) headers in source files. +License information for data files is provided in [`.reuse/dep5`](.reuse/dep5) as per the [`reuse` standard](https://reuse.software/spec/). 
To the author's knowledge, pypdfium2 is one of the rare Python libraries that are capable of PDF rendering while not being covered by copyleft licenses (such as the `GPL`).[^liberal_pdf_renderlibs] diff --git a/conda/helpers/recipe/meta.yaml b/conda/helpers/recipe/meta.yaml index 987ad2648..06ae182b1 100644 --- a/conda/helpers/recipe/meta.yaml +++ b/conda/helpers/recipe/meta.yaml @@ -51,10 +51,10 @@ about: description: | This package provides python helpers around pdfium. Dependants are suggested to pin to a major version, but any tighter pinning is discouraged since it increases the risk for conflicts, and would lock you out from future fixes. - license: Apache-2.0 OR BSD-3-Clause + license: BSD-3-Clause, Apache-2.0 license_file: - - LICENSES/Apache-2.0.txt - LICENSES/BSD-3-Clause.txt + - LICENSES/Apache-2.0.txt - LICENSES/CC-BY-4.0.txt dev_url: https://github.com/pypdfium2-team/pypdfium2 doc_url: https://pypdfium2.readthedocs.io diff --git a/conda/raw/recipe/meta.yaml b/conda/raw/recipe/meta.yaml index c0af3fdc0..89626dc8f 100644 --- a/conda/raw/recipe/meta.yaml +++ b/conda/raw/recipe/meta.yaml @@ -52,10 +52,10 @@ about: description: | This package provides raw ctypes bindings to pdfium. Important: DO NOT PIN to an exact version, as pypdfium2_raw itself pins pdfium-binaries to achieve ABI safety. 
- license: Apache-2.0 OR BSD-3-Clause + license: BSD-3-Clause, Apache-2.0 license_file: - - LICENSES/Apache-2.0.txt - LICENSES/BSD-3-Clause.txt + - LICENSES/Apache-2.0.txt - LICENSES/CC-BY-4.0.txt dev_url: https://github.com/pypdfium2-team/pypdfium2 doc_url: https://pypdfium2.readthedocs.io diff --git a/setup.py b/setup.py index 6660a2b76..743da7385 100644 --- a/setup.py +++ b/setup.py @@ -81,7 +81,7 @@ def run_setup(modnames, pl_name, pdfium_ver): kwargs = dict( name = "pypdfium2", description = "Python bindings to PDFium", - license = "Apache-2.0 OR BSD-3-Clause", + license = "BSD-3-Clause, Apache-2.0, PdfiumThirdParty", license_files = LICENSES_SHARED, python_requires = ">= 3.6", cmdclass = {}, @@ -132,7 +132,6 @@ def run_setup(modnames, pl_name, pdfium_ver): kwargs["package_data"]["pypdfium2_raw"] = [VersionFN, BindingsFN, libname] kwargs["cmdclass"]["bdist_wheel"] = bdist_factory(pl_name) kwargs["distclass"] = BinaryDistribution - kwargs["license"] = f"({kwargs['license']}) AND LicenseRef-PdfiumThirdParty" kwargs["license_files"] += LICENSES_WHEEL if "pypdfium2" in kwargs["package_data"]: From ef0854e4931cff5dc2fcbb5ba6f2e24d95848231 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 19 Sep 2024 17:29:09 +0200 Subject: [PATCH 124/140] changelog: fix typo --- docs/devel/changelog_staging.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index f77d83a7b..c2c164438 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -25,7 +25,7 @@ - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object. - Exposed `PdfPage.flatten()` (previously semi-private `_flatten()`), after having found out how to correctly use it. Added check and updated docs accordingly. 
- Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released explicitly, given OS limits on the number of open FDs. -- If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype. +- If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programmatically handle the error subtype. - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`. - Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them. - Simplified version classes (no API change expected). From d54d0417c06603cafa124bb3c0e747869fed3794 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 19 Sep 2024 17:44:07 +0200 Subject: [PATCH 125/140] PdfPage.flatten(): add note regarding invalidation of handles --- src/pypdfium2/_helpers/page.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py index cd451a67d..9b9d28677 100644 --- a/src/pypdfium2/_helpers/page.py +++ b/src/pypdfium2/_helpers/page.py @@ -319,8 +319,8 @@ def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY): Flatten form fields and annotations into page contents. Attention: - :meth:`~.PdfDocument.init_forms` must have been called on the parent pdf, before the page was retrieved, for this method to work. - In other words, :attr:`.PdfPage.formenv` must be non-null. + * :meth:`~.PdfDocument.init_forms` must have been called on the parent pdf, before the page was retrieved, for this method to work. In other words, :attr:`.PdfPage.formenv` must be non-null. + * Flattening may invalidate existing handles to the page, so you'll want to re-initialize them after flattening. 
Parameters: flag (int): PDFium flattening target (:attr:`FLAT_*`) From 51d88994837a5673020dabdfe87694b257003d16 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 27 Oct 2024 00:20:59 +0200 Subject: [PATCH 126/140] `PdfBitmap.to_numpy()` Use 2d shape for single-channel bitmap --- docs/devel/changelog_staging.md | 1 + src/pypdfium2/_helpers/bitmap.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index c2c164438..5309188d5 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -9,6 +9,7 @@ - Rendering / Bitmap * Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). Instead, use `PdfPage.render()` with a loop or process pool. * Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`. + * `PdfBitmap.to_numpy()`: If the bitmap is single-channel (grayscale), use a 2d shape to avoid needlessly wrapping each pixel value in a list. * `PdfBitmap.from_pil()`: Removed `recopy` param. * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark theme" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion, as is now implemented in pypdfium2's rendering CLI. - Pageobjects diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index 10e5389e1..597ac6947 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -197,7 +197,7 @@ def to_numpy(self): The array contains as many rows as the bitmap is high. Each row contains as many pixels as the bitmap is wide. - The length of each pixel corresponds to the number of channels. 
+ Each pixel will be an array of values per channel, or just a value if there is only one channel. The resulting array is supposed to share memory with the original bitmap buffer, so changes to the buffer should be reflected in the array, and vice versa. @@ -210,11 +210,11 @@ def to_numpy(self): array = numpy.ndarray( # layout: row major - shape = (self.height, self.width, self.n_channels), + shape = (self.height, self.width, self.n_channels) if self.n_channels > 1 else (self.height, self.width), dtype = ctypes.c_ubyte, buffer = self.buffer, - # number of bytes per item for each nesting level (outer->inner, i. e. row, pixel, value) - strides = (self.stride, self.n_channels, 1), + # number of bytes per item for each nesting level (outer->inner: row, pixel, value - or row, value for a single-channel bitmap) + strides = (self.stride, self.n_channels, 1) if self.n_channels > 1 else (self.stride, 1), ) return array From 7f12ceeac6e5e2ad0483e12b7cb64b897655255b Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 27 Oct 2024 17:38:30 +0100 Subject: [PATCH 127/140] version.py: minor cleanup --- src/pypdfium2/version.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py index ec280f8dc..3bca4a666 100644 --- a/src/pypdfium2/version.py +++ b/src/pypdfium2/version.py @@ -81,15 +81,7 @@ def _hook(self): self.desc += f"@{self.origin}" -# API - PYPDFIUM_INFO = _version_pypdfium2() -PDFIUM_INFO = _version_pdfium() - - -# Docs - -PYPDFIUM_INFO = PYPDFIUM_INFO """ pypdfium2 helpers version. @@ -129,7 +121,7 @@ def _hook(self): """ -PDFIUM_INFO = PDFIUM_INFO +PDFIUM_INFO = _version_pdfium() """ PDFium version. 
From 195ce71f2d5d3799adc838e879202b76ae33663d Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 30 Oct 2024 23:06:11 +0100 Subject: [PATCH 128/140] CLI(renderer/pageobjects): slightly improve code style --- src/pypdfium2/_cli/_parsers.py | 16 +++++++++++++++- src/pypdfium2/_cli/pageobjects.py | 23 ++++++++++------------- src/pypdfium2/_cli/render.py | 22 ++++++---------------- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/pypdfium2/_cli/_parsers.py b/src/pypdfium2/_cli/_parsers.py index abffe4e5d..6ff8bdfa6 100644 --- a/src/pypdfium2/_cli/_parsers.py +++ b/src/pypdfium2/_cli/_parsers.py @@ -3,8 +3,8 @@ import os import sys -import argparse import logging +import argparse from pathlib import Path import pypdfium2._helpers as pdfium import pypdfium2.internal as pdfium_i @@ -91,6 +91,20 @@ def get_input(args, init_forms=False, **kwargs): return pdf +# dummy more_itertools.peekable().__bool__ alternative + +def _postpeek_generator(value, iterator): + yield value; yield from iterator + +def iterator_hasvalue(iterator): + try: + first_value = next(iterator) + except StopIteration: + return False, None + else: + return True, _postpeek_generator(first_value, iterator) + + if sys.version_info >= (3, 9): from argparse import BooleanOptionalAction diff --git a/src/pypdfium2/_cli/pageobjects.py b/src/pypdfium2/_cli/pageobjects.py index 933fe0ab8..7272d08f4 100644 --- a/src/pypdfium2/_cli/pageobjects.py +++ b/src/pypdfium2/_cli/pageobjects.py @@ -3,7 +3,6 @@ # TODO test-confirm filter and info params -from itertools import chain from collections import OrderedDict import pypdfium2._helpers as pdfium import pypdfium2.internal as pdfium_i @@ -13,6 +12,7 @@ add_n_digits, get_input, round_list, + iterator_hasvalue, ) @@ -43,7 +43,7 @@ def attach(parser): ) parser.add_argument( "--info", - nargs = "*", + nargs = "+", type = str.lower, choices = INFO_PARAMS, default = INFO_PARAMS, @@ -76,24 +76,21 @@ def main(args): if args.filter: args.filter = 
[pdfium_i.ObjectTypeToConst[t] for t in args.filter] - show_pos = (PARAM_POS in args.info) - show_imageinfo = (PARAM_IMGINFO in args.info) - total_count = 0 + show_pos = PARAM_POS in args.info + show_imginfo = PARAM_IMGINFO in args.info + assert show_pos or show_imginfo + total_count = 0 for i in args.pages: page = pdf[i] - obj_searcher = page.get_objects(args.filter, max_depth=args.max_depth) - # note, more_itertools.peekable() could handle this more elegantly - try: - first_obj = next(obj_searcher) - except StopIteration: - continue + hasvalue, obj_searcher = iterator_hasvalue( page.get_objects(args.filter, max_depth=args.max_depth) ) + if not hasvalue: continue print(f"# Page {i+1}") count = 0 - for obj in chain([first_obj], obj_searcher): + for obj in obj_searcher: pad_0 = " " * obj.level pad_1 = pad_0 + " " @@ -106,7 +103,7 @@ def main(args): quad_bounds = obj.get_quad_points() print(pad_1 + f"Quad Points: {[round_list(p, args.n_digits) for p in quad_bounds]}") - if show_imageinfo and isinstance(obj, pdfium.PdfImage): + if show_imginfo and isinstance(obj, pdfium.PdfImage): print(pad_1 + f"Filters: {obj.get_filters()}") metadata = obj.get_metadata() assert (metadata.width, metadata.height) == obj.get_px_size() diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index b87233f2a..329d2a663 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -17,6 +17,7 @@ from pypdfium2._cli._parsers import ( add_input, get_input, setup_logging, + iterator_hasvalue, BooleanOptionalAction, ) @@ -288,37 +289,26 @@ def _saving_hook(self, out_path, bitmap, page, postproc_kwargs): @classmethod def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): dst_image = src_image - if invert_lightness: - if bitmap.format == pdfium_c.FPDFBitmap_Gray: dst_image = ~src_image else: - - if bitmap.rev_byteorder: - convert_to = cv2.COLOR_RGB2HLS - convert_from = cv2.COLOR_HLS2RGB - else: - convert_to = cv2.COLOR_BGR2HLS - 
convert_from = cv2.COLOR_HLS2BGR - + convert_to, convert_from = (cv2.COLOR_RGB2HLS, cv2.COLOR_HLS2RGB) if bitmap.rev_byteorder else (cv2.COLOR_BGR2HLS, cv2.COLOR_HLS2BGR) dst_image = cv2.cvtColor(dst_image, convert_to) h, l, s = cv2.split(dst_image) l = ~l dst_image = cv2.merge([h, l, s]) dst_image = cv2.cvtColor(dst_image, convert_from) - if exclude_images: - assert bitmap.format != pdfium_c.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2" + assert bitmap.format != pdfium_c.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2" # FIXME? posconv = bitmap.get_posconv(page) - image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1)) - if len(image_objs) > 0: + have_images, obj_searcher = iterator_hasvalue( page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1) ) + if have_images: mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8) - for obj in image_objs: + for obj in obj_searcher: qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32) cv2.fillPoly(mask, [qpoints], 1) dst_image = cv2.copyTo(src_image, mask=mask, dst=dst_image) - return dst_image From 5362127d9e534124881cf46cdd3bc1653ab0e187 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 25 Nov 2024 16:59:09 +0100 Subject: [PATCH 129/140] Fix some dirty code in pdfium build script had two consecutive use_syslibs if-blocks that could be merged into one. 
--- setupsrc/pypdfium2_setup/build_pdfium.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/setupsrc/pypdfium2_setup/build_pdfium.py b/setupsrc/pypdfium2_setup/build_pdfium.py index b2424bc4e..2306d6149 100755 --- a/setupsrc/pypdfium2_setup/build_pdfium.py +++ b/setupsrc/pypdfium2_setup/build_pdfium.py @@ -88,20 +88,18 @@ def dl_depottools(do_update): def dl_pdfium(GClient, do_update, revision): - is_sync = True - if PDFiumDir.exists(): if do_update: print("PDFium: Revert / Sync ...") run_cmd([GClient, "revert"], cwd=SBDir) else: - is_sync = False print("PDFium: Using existing repository as-is.") else: print("PDFium: Download ...") + do_update = True run_cmd([GClient, "config", "--custom-var", "checkout_configuration=minimal", "--unmanaged", PdfiumURL], cwd=SBDir) - if is_sync: + if do_update: # TODO consider passing -D ? run_cmd([GClient, "sync", "--revision", f"origin/{revision}", "--no-history", "--shallow"], cwd=SBDir) # quick & dirty fix to make a versioned commit available (pdfium gets tagged frequently, so this should be more than enough in practice) @@ -109,7 +107,7 @@ def dl_pdfium(GClient, do_update, revision): run_cmd(["git", "fetch", "--depth=100"], cwd=PDFiumDir) run_cmd(["git", "fetch", "--depth=100"], cwd=PDFiumDir) - return is_sync + return do_update def _dl_unbundler(): @@ -245,21 +243,19 @@ def main( GN = get_tool("gn") Ninja = get_tool("ninja") - pdfium_dl_done = dl_pdfium(GClient, b_update, b_revision) + did_pdfium_sync = dl_pdfium(GClient, b_update, b_revision) v_short, v_post = identify_pdfium() print(f"Version {v_short} {v_post}", file=sys.stderr) - if pdfium_dl_done: + if did_pdfium_sync: patch_pdfium(v_short) - if b_use_syslibs: - _dl_unbundler() - - if b_use_syslibs: - run_cmd(["python3", "build/linux/unbundle/replace_gn_files.py", "--system-libraries", "icu"], cwd=PDFiumDir) config_dict = DefaultConfig.copy() if b_use_syslibs: + _dl_unbundler() + run_cmd(["python3", 
"build/linux/unbundle/replace_gn_files.py", "--system-libraries", "icu"], cwd=PDFiumDir) config_dict.update(SyslibsConfig) + config_str = serialise_config(config_dict) print(f"\nBuild configuration:\n{config_str}\n") From cc728db5823dcab29bdc5893375f1d5e7cd88582 Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 27 Nov 2024 00:36:01 +0100 Subject: [PATCH 130/140] Consistently use iterator_hasvalue() --- src/pypdfium2/_cli/render.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 329d2a663..03e9f9012 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -262,11 +262,11 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images) if exclude_images: # FIXME pdfium does not seem to provide APIs to translate XObject to page coordinates, so not sure how to handle images nested in XObjects. # FIXME we'd also like to take alpha masks into account, but this may be difficult as long as pdfium does not expose them directly. - image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1)) - if len(image_objs) > 0: + have_images, obj_walker = iterator_hasvalue( page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1) ) + if have_images: mask = PIL.Image.new("1", src_image.size) draw = PIL.ImageDraw.Draw(mask) - for obj in image_objs: + for obj in obj_walker: qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()] draw.polygon(qpoints, fill=1) dst_image.paste(src_image, mask=mask) @@ -302,10 +302,10 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images): if exclude_images: assert bitmap.format != pdfium_c.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2" # FIXME? 
posconv = bitmap.get_posconv(page) - have_images, obj_searcher = iterator_hasvalue( page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1) ) + have_images, obj_walker = iterator_hasvalue( page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1) ) if have_images: mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8) - for obj in obj_searcher: + for obj in obj_walker: qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32) cv2.fillPoly(mask, [qpoints], 1) dst_image = cv2.copyTo(src_image, mask=mask, dst=dst_image) From d39dbf8cd8e493542522a0c378d01a72b4118ace Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 27 Nov 2024 01:16:20 +0100 Subject: [PATCH 131/140] fix awkward list default having a mutable default parameter is dangerous / bad practice (although it was not immediately harmful in this instance) --- src/pypdfium2/version.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py index 3bca4a666..2be800a8f 100644 --- a/src/pypdfium2/version.py +++ b/src/pypdfium2/version.py @@ -32,12 +32,12 @@ def __repr__(self): def _craft_tag(self): return ".".join(str(v) for v in self.api_tag) - def _craft_desc(self, suffix=[]): + def _craft_desc(self, *suffixes): local_ver = [] if self.n_commits > 0: local_ver += [str(self.n_commits), str(self.hash)] - local_ver += suffix + local_ver += suffixes desc = "" if local_ver: @@ -56,8 +56,8 @@ def _hook(self): if self.beta is not None: self.tag += f"b{self.beta}" - suffix = ["dirty"] if self.dirty else [] - self.desc = self._craft_desc(suffix) + suffixes = ["dirty"] if self.dirty else [] + self.desc = self._craft_desc(*suffixes) if self.data_source != "git": self.desc += f":{self.data_source}" if self.is_editable: From 2778473241cb457e6737f8cd4d7a1edf77661f98 Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 27 Nov 2024 01:58:12 +0100 Subject: [PATCH 132/140] Avoid bool dicts --- 
src/pypdfium2/_helpers/bitmap.py | 8 ++++---- src/pypdfium2/_helpers/pageobjects.py | 12 ++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py index 597ac6947..403c68ce4 100644 --- a/src/pypdfium2/_helpers/bitmap.py +++ b/src/pypdfium2/_helpers/bitmap.py @@ -57,10 +57,10 @@ def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, ne self.format = format self.rev_byteorder = rev_byteorder self.n_channels = pdfium_i.BitmapTypeToNChannels[self.format] - self.mode = { - False: pdfium_i.BitmapTypeToStr, - True: pdfium_i.BitmapTypeToStrReverse, - }[self.rev_byteorder][self.format] + self.mode = ( + pdfium_i.BitmapTypeToStrReverse if self.rev_byteorder else \ + pdfium_i.BitmapTypeToStr + )[self.format] # slot to store arguments for PdfPosConv, set on page rendering self._pos_args = None diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 7565b1598..f59b8659b 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -235,10 +235,8 @@ def load_jpeg(self, source, pages=None, inline=False, autoclose=True): raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte stream.") bufaccess, to_hold = pdfium_i.get_bufreader(buffer) - loader = { - False: pdfium_c.FPDFImageObj_LoadJpegFile, - True: pdfium_c.FPDFImageObj_LoadJpegFileInline, - }[inline] + loader = pdfium_c.FPDFImageObj_LoadJpegFileInline if inline else \ + pdfium_c.FPDFImageObj_LoadJpegFile c_pages, page_count = pdfium_i.pages_c_array(pages) ok = loader(c_pages, page_count, self, bufaccess) @@ -306,10 +304,8 @@ def get_data(self, decode_simple=False): Returns: ctypes.Array: The data of the image stream (as :class:`~ctypes.c_ubyte` array). 
""" - func = { - False: pdfium_c.FPDFImageObj_GetImageDataRaw, - True: pdfium_c.FPDFImageObj_GetImageDataDecoded, - }[decode_simple] + func = pdfium_c.FPDFImageObj_GetImageDataDecoded if decode_simple else \ + pdfium_c.FPDFImageObj_GetImageDataRaw n_bytes = func(self, None, 0) buffer = (ctypes.c_ubyte * n_bytes)() func(self, buffer, n_bytes) From 58f508af629c290ec62dbdafecd4100024864b6d Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 27 Nov 2024 02:02:55 +0100 Subject: [PATCH 133/140] fix awkward formatting w/ auto-wrap alternatively, we could put the + on the beginning of the other line but in this case it's easiest to just omit it --- setupsrc/pypdfium2_setup/autorelease.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setupsrc/pypdfium2_setup/autorelease.py b/setupsrc/pypdfium2_setup/autorelease.py index 1f5525f48..979cdca39 100644 --- a/setupsrc/pypdfium2_setup/autorelease.py +++ b/setupsrc/pypdfium2_setup/autorelease.py @@ -190,7 +190,7 @@ def main(): parsed_helpers = parse_git_tag() if new_helpers != parsed_helpers: print( - "Warning: Written and parsed helpers do not match. This should not happen in CI.\n" + + "Warning: Written and parsed helpers do not match. 
This should not happen in CI.\n" f"In: {new_helpers}\n" + f"Out: {parsed_helpers}" ) make_releasenotes(summary, record["pdfium"], new_pdfium, prev_tag, new_tag, c_updates) From 11469fec6cea357d3f6a4241cc640884b5900e79 Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 18 Dec 2024 20:23:04 +0100 Subject: [PATCH 134/140] add two FIXMEs --- src/pypdfium2/_library_scope.py | 1 + tests/test_misc.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/pypdfium2/_library_scope.py b/src/pypdfium2/_library_scope.py index d66daf21e..54c5cbb85 100644 --- a/src/pypdfium2/_library_scope.py +++ b/src/pypdfium2/_library_scope.py @@ -10,6 +10,7 @@ def init_lib(): assert not pdfium_i.LIBRARY_AVAILABLE if pdfium_i.DEBUG_AUTOCLOSE: + # FIXME never shown, because DEBUG_AUTOCLOSE can only be set on the caller side after pypdfium2 has been imported... print("Initialize PDFium (auto)", file=sys.stderr) # PDFium init API may change in the future: https://crbug.com/pdfium/1446 diff --git a/tests/test_misc.py b/tests/test_misc.py index 739fe9e98..33e72c173 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -40,6 +40,7 @@ def _filter(prefix, skips=[], type=int): BitmapNsp = _filter("FPDFBitmap_", [pdfium_c.FPDFBitmap_Unknown]) PageObjNsp = _filter("FPDF_PAGEOBJ_") ErrorMapping = pdfium_i.ErrorToStr +# FIXME this will cause an erroneous test failure when using the reference bindings with a non-XFA build if "XFA" in PDFIUM_INFO.flags: ErrorMapping.update(pdfium_i.XFAErrorToStr) From 379d9b5a949cc4bf0de2dd63636b765545b07dc7 Mon Sep 17 00:00:00 2001 From: geisserml Date: Wed, 18 Dec 2024 20:59:33 +0100 Subject: [PATCH 135/140] First steps towards android detection --- setupsrc/pypdfium2_setup/packaging_base.py | 34 +++++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py index 887a6c6b1..481e7c0c9 100644 --- a/setupsrc/pypdfium2_setup/packaging_base.py +++ 
b/setupsrc/pypdfium2_setup/packaging_base.py @@ -305,8 +305,12 @@ def __init__(self): # If we are on Linux, check if we have glibc or musl self._libc_name, self._libc_ver = _get_libc_info() - # TODO consider cached property for platform and system - self.platform = self._get_platform() + # TODO consider cached property for platform and system? + try: + self.platform = self._get_platform() + except Exception as e: + self.platform = None + self._exc = e self.system = None if self.platform is not None: self.system = plat_to_system(self.platform) @@ -320,19 +324,30 @@ def __repr__(self): def _is_plat(self, system, machine): return self._system_name.startswith(system) and self._machine_name.startswith(machine) + def _handle_linux_libc(self, archid): + if self._libc_name == "glibc": + return getattr(PlatNames, f"linux_{archid}") + elif self._libc_name == "musl": + return getattr(PlatNames, f"linux_musl_{archid}") + elif self._libc_name == "libc": + raise RuntimeError(f"Android {archid!r} prior to PEP 738 - not handled in pypdfium2 yet.") + else: + raise RuntimeError(f"Linux with unhandled libc {self._libc_name!r}.") + def _get_platform(self): - # some machine names are merely "qualified guesses", mistakes can't be fully excluded for platforms we don't have access to if self._is_plat("darwin", "x86_64"): return PlatNames.darwin_x64 elif self._is_plat("darwin", "arm64"): return PlatNames.darwin_arm64 elif self._is_plat("linux", "x86_64"): - return PlatNames.linux_x64 if self._libc_name != "musl" else PlatNames.linux_musl_x64 + return self._handle_linux_libc("x64") elif self._is_plat("linux", "i686"): - return PlatNames.linux_x86 if self._libc_name != "musl" else PlatNames.linux_musl_x86 + return self._handle_linux_libc("x86") elif self._is_plat("linux", "aarch64"): - return PlatNames.linux_arm64 if self._libc_name != "musl" else PlatNames.linux_musl_arm64 + return self._handle_linux_libc("arm64") elif self._is_plat("linux", "armv7l"): + if self._libc_name != "glibc": + 
raise RuntimeError(f"armv7l: only glibc supported at this time, you have {self._libc_name!r}") # no musl/android return PlatNames.linux_arm32 elif self._is_plat("windows", "amd64"): return PlatNames.windows_x64 @@ -340,8 +355,10 @@ def _get_platform(self): return PlatNames.windows_arm64 elif self._is_plat("windows", "x86"): return PlatNames.windows_x86 + elif self._system_name.startswith("android"): + raise RuntimeError(f"Android {self._machine_name!r} with PEP 738 - not handled in pypdfium2 yet.") else: - return None + raise RuntimeError(f"Unhandled platform: {self!r}") Host = _host_platform() @@ -608,7 +625,8 @@ def parse_pl_spec(pl_spec, with_prepare=True): if not pl_spec or pl_spec == "auto": pl_name = Host.platform if pl_name is None: - raise RuntimeError(f"No pre-built binaries available for {Host}. You may place custom binaries & bindings in data/sourcebuild and install with `{PlatSpec_EnvVar}=sourcebuild`.") + print(f"No pre-built binaries available for this host. You may place custom binaries & bindings in data/sourcebuild/ and install with `{PlatSpec_EnvVar}=sourcebuild`.", file=sys.stderr) + raise Host._exc elif hasattr(ExtPlats, pl_spec): pl_name = getattr(ExtPlats, pl_spec) elif hasattr(PlatNames, pl_spec): From 4b90faa331fc7ec29fee75a64f7b44e2d7c4986f Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 19 Dec 2024 21:32:13 +0100 Subject: [PATCH 136/140] Update test expectation for toc_maxdepth --- tests/expectations/toc_maxdepth.txt | 37 +++++++++++++++-------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/expectations/toc_maxdepth.txt b/tests/expectations/toc_maxdepth.txt index beeacb932..889fe0075 100644 --- a/tests/expectations/toc_maxdepth.txt +++ b/tests/expectations/toc_maxdepth.txt @@ -1,20 +1,21 @@ -[+] 1.outline -> 1 # FitH [746.439] - [+] 1.1.outline -> 1 # FitH [700.878] - [+] 1.1.1.outline -> 1 # FitH [632.537] - [+] 1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.outline -> 1 # FitH [597.304] - [+] 
1.1.1.1.1.1outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] - [+] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] -[+] 2.outline -> 2 # FitH [749.477] - [+] 2.1.outline -> 2 # FitH [699.36] - [+] 2.1.1.outline -> 2 # FitH [628.74] +[+100] 1.outline -> 1 # FitH [746.439] + [+100] 1.1.outline -> 1 # FitH [700.878] + [+1] 1.1.1.outline -> 1 # FitH [632.537] + [+1] 1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.outline -> 1 # FitH [597.304] + [+1] 1.1.1.1.1.1outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] + [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1 # FitH [632.946] +Maximum recursion depth 15 reached (subtree skipped). 
+[+100] 2.outline -> 2 # FitH [749.477] + [+100] 2.1.outline -> 2 # FitH [699.36] + [+100] 2.1.1.outline -> 2 # FitH [628.74] [*] 2.1.1.1.outline -> 2 # FitH [583.179] [*] 2.2 outline -> 2 # FitH [515.218] From f0409fc164b29871f3b5f3a835329e965e2aab88 Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 19 Dec 2024 21:44:17 +0100 Subject: [PATCH 137/140] Clean up & tighten platform detection --- setupsrc/pypdfium2_setup/packaging_base.py | 51 +++++++++++----------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py index 481e7c0c9..8df8e2d9b 100644 --- a/setupsrc/pypdfium2_setup/packaging_base.py +++ b/setupsrc/pypdfium2_setup/packaging_base.py @@ -321,9 +321,6 @@ def __repr__(self): info += f", {self._libc_name} {self._libc_ver}" return f"" - def _is_plat(self, system, machine): - return self._system_name.startswith(system) and self._machine_name.startswith(machine) - def _handle_linux_libc(self, archid): if self._libc_name == "glibc": return getattr(PlatNames, f"linux_{archid}") @@ -335,30 +332,32 @@ def _handle_linux_libc(self, archid): raise RuntimeError(f"Linux with unhandled libc {self._libc_name!r}.") def _get_platform(self): - if self._is_plat("darwin", "x86_64"): - return PlatNames.darwin_x64 - elif self._is_plat("darwin", "arm64"): - return PlatNames.darwin_arm64 - elif self._is_plat("linux", "x86_64"): - return self._handle_linux_libc("x64") - elif self._is_plat("linux", "i686"): - return self._handle_linux_libc("x86") - elif self._is_plat("linux", "aarch64"): - return self._handle_linux_libc("arm64") - elif self._is_plat("linux", "armv7l"): - if self._libc_name != "glibc": - raise RuntimeError(f"armv7l: only glibc supported at this time, you have {self._libc_name!r}") # no musl/android - return PlatNames.linux_arm32 - elif self._is_plat("windows", "amd64"): - return PlatNames.windows_x64 - elif self._is_plat("windows", "arm64"): - return 
PlatNames.windows_arm64 - elif self._is_plat("windows", "x86"): - return PlatNames.windows_x86 - elif self._system_name.startswith("android"): + if self._system_name == "darwin": + if self._machine_name == "x86_64": + return PlatNames.darwin_x64 + elif self._machine_name == "arm64": + return PlatNames.darwin_arm64 + elif self._system_name == "linux": + if self._machine_name == "x86_64": + return self._handle_linux_libc("x64") + elif self._machine_name == "i686": + return self._handle_linux_libc("x86") + elif self._machine_name == "aarch64": + return self._handle_linux_libc("arm64") + elif self._machine_name == "armv7l": + if self._libc_name != "glibc": + raise RuntimeError(f"armv7l: only glibc supported at this time, you have {self._libc_name!r}") # no musl/android + return PlatNames.linux_arm32 + elif self._system_name == "windows": + if self._machine_name == "amd64": + return PlatNames.windows_x64 + elif self._machine_name == "x86": + return PlatNames.windows_x86 + elif self._machine_name == "arm64": + return PlatNames.windows_arm64 + elif self._system_name == "android": raise RuntimeError(f"Android {self._machine_name!r} with PEP 738 - not handled in pypdfium2 yet.") - else: - raise RuntimeError(f"Unhandled platform: {self!r}") + raise RuntimeError(f"Unhandled platform: {self!r}") Host = _host_platform() From 8d87525d356f430f327a3ae28d470cfbbf2e13c9 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 27 Dec 2024 01:20:56 +0100 Subject: [PATCH 138/140] Build reference bindings without srcinfo CC #335 --- setupsrc/pypdfium2_setup/autorelease.py | 2 +- setupsrc/pypdfium2_setup/packaging_base.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/setupsrc/pypdfium2_setup/autorelease.py b/setupsrc/pypdfium2_setup/autorelease.py index 979cdca39..69a77a91d 100644 --- a/setupsrc/pypdfium2_setup/autorelease.py +++ b/setupsrc/pypdfium2_setup/autorelease.py @@ -23,7 +23,7 @@ def run_local(*args, **kws): def update_refbindings(version): 
RefBindingsFile.unlink() - build_pdfium_bindings(version, guard_symbols=True, flags=REFBINDINGS_FLAGS, allow_system_despite_libdirs=True) + build_pdfium_bindings(version, guard_symbols=True, flags=REFBINDINGS_FLAGS, allow_system_despite_libdirs=True, no_srcinfo=True) shutil.copyfile(DataDir_Bindings/BindingsFN, RefBindingsFile) assert RefBindingsFile.exists() diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py index 8df8e2d9b..7e4cbf437 100644 --- a/setupsrc/pypdfium2_setup/packaging_base.py +++ b/setupsrc/pypdfium2_setup/packaging_base.py @@ -444,7 +444,7 @@ def tmp_cwd_context(tmp_cwd): os.chdir(orig_cwd) -def run_ctypesgen(target_dir, headers_dir, flags=[], guard_symbols=False, compile_lds=[], run_lds=["."], allow_system_despite_libdirs=False): +def run_ctypesgen(target_dir, headers_dir, flags=[], compile_lds=[], run_lds=["."], allow_system_despite_libdirs=False, guard_symbols=False, no_srcinfo=False): # Import ctypesgen only in this function so it does not have to be available for other setup tasks import ctypesgen assert getattr(ctypesgen, "PYPDFIUM2_SPECIFIC", False), "pypdfium2 requires fork of ctypesgen" @@ -465,6 +465,8 @@ def run_ctypesgen(target_dir, headers_dir, flags=[], guard_symbols=False, compil args += ["--no-macro-guards"] if not guard_symbols: args += ["--no-symbol-guards"] + if no_srcinfo: + args += ["--no-srcinfo"] # pre-processor - if not given, pypdfium2-ctypesgen will try to auto-select as available (gcc/clang) c_preproc = os.environ.get("CPP", None) From bc0d92e192e1ebb0a3d10e6f820cf1edd6381269 Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 27 Dec 2024 01:30:17 +0100 Subject: [PATCH 139/140] Fix perilous mutable defaults I don't think there were any actual issues, but in general this is just too risky. 
--- setupsrc/pypdfium2_setup/craft_packages.py | 2 +- setupsrc/pypdfium2_setup/emplace.py | 2 +- setupsrc/pypdfium2_setup/packaging_base.py | 8 ++++---- tests/test_misc.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/setupsrc/pypdfium2_setup/craft_packages.py b/setupsrc/pypdfium2_setup/craft_packages.py index de42bc421..cebf48718 100644 --- a/setupsrc/pypdfium2_setup/craft_packages.py +++ b/setupsrc/pypdfium2_setup/craft_packages.py @@ -110,7 +110,7 @@ def main_pypi(args): clean_platfiles() -def run_conda_build(recipe_dir, out_dir, args=[]): +def run_conda_build(recipe_dir, out_dir, args=()): with TmpCommitCtx(): run_cmd(["conda", "build", recipe_dir, "--output-folder", out_dir, *args], cwd=ProjectDir, env=os.environ) diff --git a/setupsrc/pypdfium2_setup/emplace.py b/setupsrc/pypdfium2_setup/emplace.py index fe15a5fcf..81bfae246 100644 --- a/setupsrc/pypdfium2_setup/emplace.py +++ b/setupsrc/pypdfium2_setup/emplace.py @@ -53,7 +53,7 @@ def prepare_setup(pl_name, pdfium_ver, use_v8): if pl_name == ExtPlats.system: # TODO add option for caller to pass in custom headers_dir, run_lds and flags? 
unfortunately it's not straightforward how to integrate this # also want to consider accepting a full version for offline setup - build_pdfium_bindings(pdfium_ver, flags=flags, guard_symbols=True, run_lds=[]) + build_pdfium_bindings(pdfium_ver, flags=flags, guard_symbols=True, run_lds=()) shutil.copyfile(DataDir_Bindings/BindingsFN, ModuleDir_Raw/BindingsFN) write_pdfium_info(ModuleDir_Raw, pdfium_ver, origin="system", flags=flags) return [BindingsFN, VersionFN] diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py index 7e4cbf437..e518e3820 100644 --- a/setupsrc/pypdfium2_setup/packaging_base.py +++ b/setupsrc/pypdfium2_setup/packaging_base.py @@ -202,8 +202,8 @@ def write_json(fp, data, indent=2): return json.dump(data, buf, indent=indent) -def write_pdfium_info(dir, build, origin, flags=[], n_commits=0, hash=None): - info = dict(**PdfiumVer.to_full(build)._asdict(), n_commits=n_commits, hash=hash, origin=origin, flags=flags) +def write_pdfium_info(dir, build, origin, flags=(), n_commits=0, hash=None): + info = dict(**PdfiumVer.to_full(build)._asdict(), n_commits=n_commits, hash=hash, origin=origin, flags=list(flags)) write_json(dir/VersionFN, info) return info @@ -444,7 +444,7 @@ def tmp_cwd_context(tmp_cwd): os.chdir(orig_cwd) -def run_ctypesgen(target_dir, headers_dir, flags=[], compile_lds=[], run_lds=["."], allow_system_despite_libdirs=False, guard_symbols=False, no_srcinfo=False): +def run_ctypesgen(target_dir, headers_dir, flags=(), compile_lds=(), run_lds=(".", ), allow_system_despite_libdirs=False, guard_symbols=False, no_srcinfo=False): # Import ctypesgen only in this function so it does not have to be available for other setup tasks import ctypesgen assert getattr(ctypesgen, "PYPDFIUM2_SPECIFIC", False), "pypdfium2 requires fork of ctypesgen" @@ -491,7 +491,7 @@ def run_ctypesgen(target_dir, headers_dir, flags=[], compile_lds=[], run_lds=[". 
def build_pdfium_bindings(version, headers_dir=None, **kwargs): - defaults = dict(flags=[], run_lds=["."], guard_symbols=False) + defaults = dict(flags=(), run_lds=(".", ), guard_symbols=False) for k, v in defaults.items(): kwargs.setdefault(k, v) diff --git a/tests/test_misc.py b/tests/test_misc.py index 33e72c173..822edc80c 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -27,7 +27,7 @@ def test_color_tohex(color_in, rev_byteorder, exp_color): assert pdfium_c.FPDF_GetBValue(exp_color) == channels[3] -def _filter(prefix, skips=[], type=int): +def _filter(prefix, skips=(), type=int): items = [] for attr in dir(pdfium_c): value = getattr(pdfium_c, attr) From 820f5c53b4b572ccc3131067ecca7667b6a63017 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 30 Dec 2024 21:10:53 +0100 Subject: [PATCH 140/140] Warn about --optimize-mode lcd with --invert-lightness Unfortunately, colour post-processing destroys LCD optimization --- src/pypdfium2/_cli/render.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py index 03e9f9012..6d4567205 100644 --- a/src/pypdfium2/_cli/render.py +++ b/src/pypdfium2/_cli/render.py @@ -196,7 +196,7 @@ def attach(parser): postproc.add_argument( "--invert-lightness", action = "store_true", - help = "Invert lightness using the HLS color space (e.g. white<->black, dark_blue<->light_blue). The intent is to achieve a dark theme for documents with light background, while providing better visual results than classical color inversion or a flat pdfium color scheme.", + help = "Invert lightness using the HLS color space (e.g. white<->black, dark_blue<->light_blue). The intent is to achieve a dark theme for documents with light background, while providing better visual results than classical color inversion or a flat pdfium color scheme. 
However, note that --optimize-mode lcd is not recommendable when inverting lightness.", ) postproc.add_argument( "--exclude-images", @@ -393,6 +393,8 @@ def main(args): invert_lightness = args.invert_lightness, exclude_images = args.exclude_images, ) + if args.invert_lightness and args.optimize_mode == "lcd": + logger.warning("LCD optimization clashes with lightness inversion, as post-processing colours defeats the idea of subpixel rendering.") # TODO dump all args except password? logger.info(f"{args.engine_cls.__name__}, Format: {args.format}, rev_byteorder: {args.rev_byteorder}, prefer_bgrx {args.prefer_bgrx}")