From 9856cfce107dced253ba00ad252328cf6361908d Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 15:48:12 +0200
Subject: [PATCH 001/140] Work on API-breaking changes (bookmarks)

This backports (and slightly improves) the new bookmark API from
devel_new. The test suite has not been updated yet (TBD).
---
 src/pypdfium2/_cli/toc.py          |  31 ++---
 src/pypdfium2/_helpers/document.py | 174 +++++++++++++++--------------
 2 files changed, 111 insertions(+), 94 deletions(-)

diff --git a/src/pypdfium2/_cli/toc.py b/src/pypdfium2/_cli/toc.py
index 6921c2af8..f05f50d6c 100644
--- a/src/pypdfium2/_cli/toc.py
+++ b/src/pypdfium2/_cli/toc.py
@@ -25,18 +25,23 @@ def attach(parser):
 def main(args):
     
     pdf = get_input(args)
-    toc = pdf.get_toc(
-        max_depth = args.max_depth,
-    )
+    toc = pdf.get_toc(max_depth=args.max_depth)
     
-    for item in toc:
-        state = "*" if item.n_kids == 0 else "-" if item.is_closed else "+"
-        target = "?" if item.page_index is None else item.page_index+1
-        print(
-            "    " * item.level +
-            "[%s] %s -> %s  # %s %s" % (
-                state, item.title, target,
-                pdfium_i.ViewmodeToStr.get(item.view_mode),
-                round_list(item.view_pos, args.n_digits),
-            )
+    for bm in toc:
+        count, dest = bm.get_count(), bm.get_dest()
+        out = "    " * bm.level
+        out += "[%s] %s -> " % (
+            "*" if count == 0 else f"{count:+}",
+            bm.get_title(),
         )
+        # distinguish between "no dest" and "dest with invalid values" while keeping result machine readable
+        if dest:
+            index, (view_mode, view_pos) = dest.get_index(), dest.get_view()
+            out += "%s  # %s %s" % (
+                index+1 if index != None else "?",
+                pdfium_i.ViewmodeToStr.get(view_mode),
+                round_list(view_pos, args.n_digits),
+            )
+        else:
+            out += "_"
+        print(out)
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index b12296942..629e4ad45 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = ("PdfDocument", "PdfFormEnv", "PdfXObject", "PdfOutlineItem")
+__all__ = ("PdfDocument", "PdfFormEnv", "PdfXObject", "PdfBookmark", "PdfDest")
 
 import os
 import ctypes
@@ -183,7 +183,6 @@ def init_forms(self, config=None):
                 )
     
     
-    # TODO?(v5) consider cached property
     def get_formtype(self):
         """
         Returns:
@@ -193,7 +192,6 @@ def get_formtype(self):
         return pdfium_c.FPDF_GetFormType(self)
     
     
-    # TODO?(v5) consider cached property
     def get_pagemode(self):
         """
         Returns:
@@ -202,7 +200,6 @@ def get_pagemode(self):
         return pdfium_c.FPDFDoc_GetPageMode(self)
     
     
-    # TODO?(v5) consider cached property
     def is_tagged(self):
         """
         Returns:
@@ -355,7 +352,6 @@ def del_attachment(self, index):
             raise PdfiumError(f"Failed to delete attachment at index {index}.")
     
     
-    # TODO deprecate in favour of index access?
     def get_page(self, index):
         """
         Returns:
@@ -398,7 +394,7 @@ def new_page(self, width, height, index=None):
             index = len(self)
         raw_page = pdfium_c.FPDFPage_New(self, index, width, height)
         page = PdfPage(raw_page, self, None)
-        # not doing formenv calls for new pages as we don't see the point
+        # not doing formenv calls for new pages
         self._add_kid(page)
         return page
     
@@ -406,8 +402,9 @@ def new_page(self, width, height, index=None):
     def del_page(self, index):
         """
         Remove the page at *index* (zero-based).
+        It is recommended to close any open handles to the page before deleting it.
         """
-        # FIXME what if the caller still has a handle to the page?
+        # FIXME not sure how pdfium would behave if the caller tries to access a handle to a deleted page...
         pdfium_c.FPDFPage_Delete(self, index)
     
     
@@ -486,42 +483,6 @@ def page_as_xobject(self, index, dest_pdf):
         return xobject
     
     
-    # TODO(apibreak) consider switching to a wrapper class around the raw bookmark
-    # (either with getter methods, or possibly cached properties)
-    def _get_bookmark(self, bookmark, level):
-        
-        n_bytes = pdfium_c.FPDFBookmark_GetTitle(bookmark, None, 0)
-        buffer = ctypes.create_string_buffer(n_bytes)
-        pdfium_c.FPDFBookmark_GetTitle(bookmark, buffer, n_bytes)
-        title = buffer.raw[:n_bytes-2].decode('utf-16-le')
-        
-        # TODO(apibreak) just expose count as-is rather than using two variables and doing extra work
-        count = pdfium_c.FPDFBookmark_GetCount(bookmark)
-        is_closed = True if count < 0 else None if count == 0 else False
-        n_kids = abs(count)
-        
-        dest = pdfium_c.FPDFBookmark_GetDest(self, bookmark)
-        page_index = pdfium_c.FPDFDest_GetDestPageIndex(self, dest)
-        if page_index == -1:
-            page_index = None
-        
-        n_params = ctypes.c_ulong()
-        view_pos = (pdfium_c.FS_FLOAT * 4)()
-        view_mode = pdfium_c.FPDFDest_GetView(dest, n_params, view_pos)
-        view_pos = list(view_pos)[:n_params.value]
-        
-        return PdfOutlineItem(
-            level = level,
-            title = title,
-            is_closed = is_closed,
-            n_kids = n_kids,
-            page_index = page_index,
-            view_mode = view_mode,
-            view_pos = view_pos,
-        )
-    
-    
-    # TODO(apibreak) change outline API (see above)
     def get_toc(
             self,
             max_depth = 15,
@@ -530,39 +491,37 @@ def get_toc(
             seen = None,
         ):
         """
-        Iterate through the bookmarks in the document's table of contents.
+        Iterate through the bookmarks in the document's table of contents (TOC).
         
         Parameters:
             max_depth (int):
                 Maximum recursion depth to consider.
         Yields:
-            :class:`.PdfOutlineItem`: Bookmark information.
+            :class:`.PdfBookmark`
         """
         
         if seen is None:
             seen = set()
         
-        bookmark = pdfium_c.FPDFBookmark_GetFirstChild(self, parent)
+        bm_ptr = pdfium_c.FPDFBookmark_GetFirstChild(self, parent)
         
-        while bookmark:
+        # NOTE We need bool(ptr) here to handle cases where .contents is a null pointer (raises exception on access). Don't use ptr != None, it's always true.
+        while bm_ptr:
             
-            address = ctypes.addressof(bookmark.contents)
+            address = ctypes.addressof(bm_ptr.contents)
             if address in seen:
-                logger.warning("A circular bookmark reference was detected whilst parsing the table of contents.")
+                logger.warning("A circular bookmark reference was detected while traversing the table of contents.")
                 break
             else:
                 seen.add(address)
             
-            yield self._get_bookmark(bookmark, level)
+            yield PdfBookmark(bm_ptr, self, level)
             if level < max_depth-1:
-                yield from self.get_toc(
-                    max_depth = max_depth,
-                    parent = bookmark,
-                    level = level + 1,
-                    seen = seen,
-                )
+                yield from self.get_toc(max_depth=max_depth, parent=bm_ptr, level=level+1, seen=seen)
+            elif pdfium_c.FPDFBookmark_GetFirstChild(self, bm_ptr):
+                logger.warning(f"Maximum recursion depth {max_depth} reached. Children beyond this scope are ignored.")
             
-            bookmark = pdfium_c.FPDFBookmark_GetNextSibling(self, bookmark)
+            bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr)
     
     
     def render(
@@ -681,28 +640,81 @@ def _open_pdf(input_data, password, autoclose):
     return pdf, to_hold, to_close
 
 
-# TODO(apibreak) change outline API (see above)
-PdfOutlineItem = namedtuple("PdfOutlineItem", "level title is_closed n_kids page_index view_mode view_pos")
-"""
-Bookmark information.
+class PdfBookmark (pdfium_i.AutoCastable):
+    """
+    Bookmark helper class.
+    
+    Attributes:
+        raw (FPDF_BOOKMARK):
+            The underlying PDFium bookmark handle.
+        pdf (PdfDocument):
+            Reference to the document this bookmark belongs to.
+        level (int):
+            The bookmark's nesting level in the TOC tree. Corresponds to the number of parent bookmarks.
+    """
+    
+    def __init__(self, raw, pdf, level):
+        self.raw, self.pdf, self.level = raw, pdf, level
+    
+    def get_title(self):
+        """
+        Returns:
+            str: The bookmark's title string.
+        """
+        n_bytes = pdfium_c.FPDFBookmark_GetTitle(self, None, 0)
+        buffer = ctypes.create_string_buffer(n_bytes)
+        pdfium_c.FPDFBookmark_GetTitle(self, buffer, n_bytes)
+        return buffer.raw[:n_bytes-2].decode("utf-16-le")
+    
+    def get_count(self):
+        """
+        Returns:
+            int: Signed number of child bookmarks (fully recursive). Zero if the bookmark has no descendants.
+            The initial state shall be closed (collapsed) if negative, open (expanded) if positive.
+        """
+        return pdfium_c.FPDFBookmark_GetCount(self)
+    
+    def get_dest(self):
+        """
+        Returns:
+            PdfDest | None: The bookmark's destination (page index, viewport), or None on failure.
+        """
+        raw_dest = pdfium_c.FPDFBookmark_GetDest(self.pdf, self)
+        if not raw_dest:
+            return None
+        return PdfDest(raw_dest, pdf=self.pdf)
+
 
-Parameters:
-    level (int):
-        Number of parent items.
-    title (str):
-        Title string of the bookmark.
-    is_closed (bool):
-        True if child items shall be collapsed, False if they shall be expanded.
-        None if the item has no descendants (i. e. ``n_kids == 0``).
-    n_kids (int):
-        Absolute number of child items, according to the PDF.
-    page_index (int | None):
-        Zero-based index of the page the bookmark points to.
-        May be None if the bookmark has no target page (or it could not be determined).
-    view_mode (int):
-        A view mode constant (:data:`PDFDEST_VIEW_*`) defining how the coordinates of *view_pos* shall be interpreted.
-    view_pos (list[float]):
-        Target position on the page the viewport should jump to when the bookmark is clicked.
-        It is a sequence of :class:`float` values in PDF canvas units.
-        Depending on *view_mode*, it may contain between 0 and 4 coordinates.
-"""
+class PdfDest (pdfium_i.AutoCastable):
+    """
+    Destination helper class.
+    
+    Attributes:
+        raw (FPDF_DEST): The underlying PDFium destination handle.
+        pdf (PdfDocument): Reference to the document this dest belongs to.
+    """
+    
+    def __init__(self, raw, pdf):
+        self.raw, self.pdf = raw, pdf
+    
+    def get_index(self):
+        """
+        Returns:
+            int | None: Zero-based index of the page the dest points to, or None on failure.
+        """
+        val = pdfium_c.FPDFDest_GetDestPageIndex(self.pdf, self)
+        return val if val >= 0 else None
+    
+    def get_view(self):
+        """
+        Returns:
+            (int, list[float]): A tuple of (view_mode, view_pos).
+            *view_mode* is a constant (one of :data:`PDFDEST_VIEW_*`) defining how *view_pos* shall be interpreted.
+            *view_pos* is the target position on the page the dest points to.
+            It may contain between 0 to 4 float coordinates, depending on the view mode.
+        """
+        n_params = ctypes.c_ulong()
+        pos = (pdfium_c.FS_FLOAT * 4)()
+        mode = pdfium_c.FPDFDest_GetView(self, n_params, pos)
+        pos = list(pos)[:n_params.value]
+        return (mode, pos)

From 0183d80a4cdc03356dae34c17ab8834d3c227d0c Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 17:21:44 +0200
Subject: [PATCH 002/140] toc: update API test

---
 tests/test_toc.py | 48 ++++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/tests/test_toc.py b/tests/test_toc.py
index a1b54a1db..b8e0869a5 100644
--- a/tests/test_toc.py
+++ b/tests/test_toc.py
@@ -8,10 +8,19 @@
 from .conftest import TestResources
 
 
-def _compare_bookmark(bookmark, view_pos, **kwargs):
-    for name, exp_value in kwargs.items():
-        assert exp_value == getattr(bookmark, name)
-    assert pytest.approx(bookmark.view_pos, abs=1) == view_pos
+def _compare_bookmark(bm, **kwargs):
+    assert isinstance(bm, pdfium.PdfBookmark)
+    assert kwargs["title"] == bm.get_title()
+    assert kwargs["count"] == bm.get_count()
+    dest = bm.get_dest()
+    if dest is None:
+        assert kwargs["dest"] is None
+    else:
+        assert isinstance(dest, pdfium.PdfDest)
+        assert kwargs["page_index"] == dest.get_index()
+        view_mode, view_pos = dest.get_view()
+        assert kwargs["view_mode"] == view_mode
+        assert kwargs["view_pos"] == pytest.approx(view_pos, abs=1)
 
 
 def test_gettoc():
@@ -26,25 +35,24 @@ def test_gettoc():
         page_index = 0,
         view_mode = pdfium_c.PDFDEST_VIEW_XYZ,
         view_pos = (89, 758, 0),
-        is_closed = True,
-        n_kids = 2,
+        count = -2,
     )
     
     # check common values
-    for bookmark in toc:
-        assert isinstance(bookmark, pdfium.PdfOutlineItem)
-        assert bookmark.view_mode is pdfium_c.PDFDEST_VIEW_XYZ
-        assert round(bookmark.view_pos[0]) == 89
+    for bm in toc:
+        dest = bm.get_dest()
+        view_mode, view_pos = dest.get_view()
+        assert view_mode is pdfium_c.PDFDEST_VIEW_XYZ
+        assert round(view_pos[0]) == 89
     
     # check last bookmark
     _compare_bookmark(
-        bookmark,
+        bm,
         title = "Three-B",
         page_index = 1,
         view_mode = pdfium_c.PDFDEST_VIEW_XYZ,
         view_pos = (89, 657, 0),
-        is_closed = None,
-        n_kids = 0,
+        count = 0,
     )
 
 
@@ -56,20 +64,14 @@ def test_gettoc_circular(caplog):
     _compare_bookmark(
         next(toc),
         title = "A Good Beginning",
-        page_index = None,
-        view_mode = pdfium_c.PDFDEST_VIEW_UNKNOWN_MODE,
-        view_pos = [],
-        is_closed = None,
-        n_kids = 0,
+        dest = None,
+        count = 0,
     )
     _compare_bookmark(
         next(toc),
         title = "A Good Ending",
-        page_index = None,
-        view_mode = pdfium_c.PDFDEST_VIEW_UNKNOWN_MODE,
-        view_pos = [],
-        is_closed = None,
-        n_kids = 0,
+        dest = None,
+        count = 0,
     )
     with caplog.at_level(logging.WARNING):
         for other in toc: pass

From d699a6ced30fdc9e866f7e9613a81882e507da4e Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 17:40:33 +0200
Subject: [PATCH 003/140] test_cli: also capture stderr/logging

Note the following test script:
```
import io
import sys
import logging
import contextlib

logger = logging.getLogger("testLogger")
logger.setLevel(logging.DEBUG)

buf = io.StringIO()
logger.addHandler(logging.StreamHandler(buf))  # !

with contextlib.redirect_stdout(buf), contextlib.redirect_stderr(buf):
    print("print to stdout")
    print("print to stderr", file=sys.stderr)
    logger.info("info message")
    logger.warning("warning message")

print(f"{buf.getvalue()!r}")
```

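With the explicit handler on `buf` (as above), we get:
> 'print to stdout\nprint to stderr\ninfo message\nwarning message\n'
Without any handler:
> 'print to stdout\nprint to stderr\nwarning message\n'
With a default handler, i.e. logging.StreamHandler() without an explicit stream:
> info message
> warning message
> 'print to stdout\nprint to stderr\n'

Weird at first sight, but explainable: a StreamHandler created without an
explicit stream binds to whatever sys.stderr is at construction time, i.e.
before the redirect takes effect, so its output bypasses the buffer. With no
handler at all, logging falls back to its last-resort handler, which looks up
sys.stderr at emit time (hence it is captured) but only handles WARNING and
above (hence the info message is missing).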
---
 tests/test_cli.py | 50 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 2b64dfd93..4c05bf5d1 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 import io
+import logging
 import filecmp
 import contextlib
 from pathlib import Path
@@ -11,28 +12,57 @@
 import pypdfium2.__main__ as pdfium_cli
 from .conftest import TestResources, TestExpectations
 
+lib_logger = logging.getLogger("pypdfium2")
 
-def run_cli(argv, exp_stdout=None, normalize_lfs=False):
+@contextlib.contextmanager
+def logging_capture_handler(buf):
+    orig_handlers = lib_logger.handlers
+    lib_logger.handlers = []
+    handler = logging.StreamHandler(buf)
+    lib_logger.addHandler(handler)
+    yield
+    lib_logger.removeHandler(handler)
+    lib_logger.handlers = orig_handlers
+
+
+@contextlib.contextmanager
+def joined_ctx(ctxes):
+    with contextlib.ExitStack() as stack:
+        for ctx in ctxes: stack.enter_context(ctx)
+        yield
+
+
+def run_cli(argv, exp_output=None, capture=("out", "err", "log"), normalize_lfs=False):
     
     argv = [str(a) for a in argv]
     
-    if exp_stdout is None:
+    if exp_output is None:
         pdfium_cli.api_main(argv)
         
     else:
         
-        stdout_buf = io.StringIO()
-        with contextlib.redirect_stdout(stdout_buf):
+        output = io.StringIO()
+        ctxes = []
+        assert isinstance(capture, (tuple, list))
+        if "out" in capture:
+            ctxes += [contextlib.redirect_stdout(output)]
+        if "err" in capture:
+            ctxes += [contextlib.redirect_stderr(output)]
+        # for some reason, logging doesn't seem to go the usual stdout/stderr path, so explicitly install a stream handler to capture
+        if "log" in capture:
+            ctxes += [logging_capture_handler(output)]
+        assert len(ctxes) >= 1
+        with joined_ctx(ctxes):
             pdfium_cli.api_main(argv)
         
-        if isinstance(exp_stdout, Path):
-            exp_stdout = exp_stdout.read_text()
+        if isinstance(exp_output, Path):
+            exp_output = exp_output.read_text()
         
-        stdout = stdout_buf.getvalue()
+        output = output.getvalue()
         if normalize_lfs:
-            stdout = stdout.replace("\r\n", "\n")
+            output = output.replace("\r\n", "\n")
         
-        assert stdout == exp_stdout
+        assert output == exp_output
 
 
 def _get_files(dir):
@@ -57,7 +87,7 @@ def test_attachments(tmp_path):
     
     edited_pdf = tmp_path / "edited.pdf"
     run_cli(["attachments", TestResources.attachments, "edit", "--del-numbers", "1,2", "--add-files", TestResources.mona_lisa, "-o", edited_pdf])
-    run_cli(["attachments", edited_pdf, "list"], "[1] mona_lisa.jpg\n")
+    run_cli(["attachments", edited_pdf, "list"], "[1] mona_lisa.jpg\n", capture=["out"])
 
 
 def test_images(tmp_path):

From 8d0d36fcd20e491d40835d40a631a714c8b59ff7 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 17:40:59 +0200
Subject: [PATCH 004/140] Update test expectations

---
 tests/expectations/attachments_list.txt    |  1 +
 tests/expectations/pdfinfo_attachments.txt |  1 +
 tests/expectations/toc.txt                 |  6 ++--
 tests/expectations/toc_circular.txt        |  5 +--
 tests/expectations/toc_maxdepth.txt        | 37 +++++++++++-----------
 5 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/tests/expectations/attachments_list.txt b/tests/expectations/attachments_list.txt
index 539fe1d54..92641d8d9 100644
--- a/tests/expectations/attachments_list.txt
+++ b/tests/expectations/attachments_list.txt
@@ -1,2 +1,3 @@
+Unsupported PDF feature: Attachment (incomplete support)
 [1] 1.txt
 [2] attached.pdf
diff --git a/tests/expectations/pdfinfo_attachments.txt b/tests/expectations/pdfinfo_attachments.txt
index 898f35b9c..1bfbeb373 100644
--- a/tests/expectations/pdfinfo_attachments.txt
+++ b/tests/expectations/pdfinfo_attachments.txt
@@ -1,3 +1,4 @@
+Unsupported PDF feature: Attachment (incomplete support)
 Page Count: 1
 PDF Version: 1.6
 ID (permanent): b'\xd8\x89\xebk\x9a\xdf\x88\xe5\xed\xa7\xdc\x08\xfe\x85\x97'
diff --git a/tests/expectations/toc.txt b/tests/expectations/toc.txt
index b635d42a4..bd6aa6b50 100644
--- a/tests/expectations/toc.txt
+++ b/tests/expectations/toc.txt
@@ -1,9 +1,9 @@
-[-] One -> 1  # XYZ [89.29, 757.7, 0.0]
+[-2] One -> 1  # XYZ [89.29, 757.7, 0.0]
     [*] One-A -> 1  # XYZ [89.29, 706.86, 0.0]
-    [-] One-B -> 1  # XYZ [89.29, 657.03, 0.0]
+    [-2] One-B -> 1  # XYZ [89.29, 657.03, 0.0]
         [*] One-B-I -> 1  # XYZ [89.29, 607.2, 0.0]
         [*] One-B-II -> 1  # XYZ [89.29, 557.76, 0.0]
 [*] Two -> 1  # XYZ [89.29, 507.16, 0.0]
-[-] Three -> 2  # XYZ [89.29, 757.7, 0.0]
+[-2] Three -> 2  # XYZ [89.29, 757.7, 0.0]
     [*] Three-A -> 2  # XYZ [89.29, 706.98, 0.0]
     [*] Three-B -> 2  # XYZ [89.29, 657.15, 0.0]
diff --git a/tests/expectations/toc_circular.txt b/tests/expectations/toc_circular.txt
index 15142248c..984920d7f 100644
--- a/tests/expectations/toc_circular.txt
+++ b/tests/expectations/toc_circular.txt
@@ -1,2 +1,3 @@
-[*] A Good Beginning -> ?  # ? []
-[*] A Good Ending -> ?  # ? []
+[*] A Good Beginning -> _
+[*] A Good Ending -> _
+A circular bookmark reference was detected while traversing the table of contents.
diff --git a/tests/expectations/toc_maxdepth.txt b/tests/expectations/toc_maxdepth.txt
index 814bfceea..47a04509a 100644
--- a/tests/expectations/toc_maxdepth.txt
+++ b/tests/expectations/toc_maxdepth.txt
@@ -1,20 +1,21 @@
-[+] 1.outline -> 1  # FitH [746.439]
-    [+] 1.1.outline -> 1  # FitH [700.878]
-        [+] 1.1.1.outline -> 1  # FitH [632.537]
-            [+] 1.1.1.1.outline -> 1  # FitH [632.946]
-                [+] 1.1.1.1.1.outline -> 1  # FitH [597.304]
-                    [+] 1.1.1.1.1.1outline -> 1  # FitH [632.946]
-                        [+] 1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                            [+] 1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                [+] 1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                    [+] 1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                        [+] 1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                            [+] 1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                                [+] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                                    [+] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                                        [+] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-[+] 2.outline -> 2  # FitH [749.4771]
-    [+] 2.1.outline -> 2  # FitH [699.36]
-        [+] 2.1.1.outline -> 2  # FitH [628.74]
+[+100] 1.outline -> 1  # FitH [746.439]
+    [+100] 1.1.outline -> 1  # FitH [700.878]
+        [+1] 1.1.1.outline -> 1  # FitH [632.537]
+            [+1] 1.1.1.1.outline -> 1  # FitH [632.946]
+                [+1] 1.1.1.1.1.outline -> 1  # FitH [597.304]
+                    [+1] 1.1.1.1.1.1outline -> 1  # FitH [632.946]
+                        [+1] 1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                            [+1] 1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                [+1] 1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                    [+1] 1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                        [+1] 1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                            [+1] 1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                                [+1] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                                    [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                                        [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+Maximum recursion depth 15 reached. Children beyond this scope are ignored.
+[+100] 2.outline -> 2  # FitH [749.4771]
+    [+100] 2.1.outline -> 2  # FitH [699.36]
+        [+100] 2.1.1.outline -> 2  # FitH [628.74]
             [*] 2.1.1.1.outline -> 2  # FitH [583.179]
     [*] 2.2 outline -> 2  # FitH [515.218]

From 847281ceac30e10f1f7c8bf7bf8dfc92b6c05f83 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 17:59:00 +0200
Subject: [PATCH 005/140] toc: better explain level == maxdepth scenario

---
 src/pypdfium2/_helpers/document.py  | 3 ++-
 tests/expectations/toc_maxdepth.txt | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 629e4ad45..80fad8cbb 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -519,7 +519,8 @@ def get_toc(
             if level < max_depth-1:
                 yield from self.get_toc(max_depth=max_depth, parent=bm_ptr, level=level+1, seen=seen)
             elif pdfium_c.FPDFBookmark_GetFirstChild(self, bm_ptr):
-                logger.warning(f"Maximum recursion depth {max_depth} reached. Children beyond this scope are ignored.")
+                # Warn only if there actually is a subtree. If level == max_depth but the tree ends there, it's fine as no information is skipped.
+                logger.warning(f"Maximum recursion depth {max_depth} reached (subtree skipped).")
             
             bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr)
     
diff --git a/tests/expectations/toc_maxdepth.txt b/tests/expectations/toc_maxdepth.txt
index 47a04509a..711731985 100644
--- a/tests/expectations/toc_maxdepth.txt
+++ b/tests/expectations/toc_maxdepth.txt
@@ -13,7 +13,7 @@
                                                 [+1] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
                                                     [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
                                                         [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-Maximum recursion depth 15 reached. Children beyond this scope are ignored.
+Maximum recursion depth 15 reached (subtree skipped).
 [+100] 2.outline -> 2  # FitH [749.4771]
     [+100] 2.1.outline -> 2  # FitH [699.36]
         [+100] 2.1.1.outline -> 2  # FitH [628.74]

From f2352263a2a1a39c8e10b3c18b57c3f181e7963a Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 18:06:45 +0200
Subject: [PATCH 006/140] Start tracking changes

---
 docs/devel/changelog_staging.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index e41edd85a..9abd31065 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -4,3 +4,9 @@
 <!-- List character: dash (-) -->
 
 # Changelog for next release
+- PdfDocument.get_toc(): Replaced bookmark namedtuple `PdfOutlineItem` with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Also provide signed count as-is rather than needlessly splitting it in two variables (unsigned int `n_kids` and bool `is_closed`).
+
+<!-- TODO
+See https://github.com/pypdfium2-team/pypdfium2/blob/devel_old/docs/devel/changelog_staging.md
+for how to proceed. Note that some things are already done, and some rejected, though.
+-->

From 4bfb46144195ba1a54921b59de92365e8fb88f9d Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 18:10:43 +0200
Subject: [PATCH 007/140] slightly improve docs for get_count()

---
 src/pypdfium2/_helpers/document.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 80fad8cbb..2f9882b61 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -670,7 +670,7 @@ def get_title(self):
     def get_count(self):
         """
         Returns:
-            int: Signed number of child bookmarks (fully recursive). Zero if the bookmark has no descendants.
+            int: Signed number of child bookmarks, recursively counting all members in the subtree. Zero if the bookmark has no descendants.
             The initial state shall be closed (collapsed) if negative, open (expanded) if positive.
         """
         return pdfium_c.FPDFBookmark_GetCount(self)

From ac7903f3d6793878da8eca1db45307e66fdcc43b Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 21:31:07 +0200
Subject: [PATCH 008/140] address various nits

---
 docs/devel/changelog_staging.md    | 4 ++--
 src/pypdfium2/_cli/toc.py          | 4 ++--
 src/pypdfium2/_helpers/document.py | 8 ++++----
 tests/test_toc.py                  | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 9abd31065..2e1e10693 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -4,9 +4,9 @@
 <!-- List character: dash (-) -->
 
 # Changelog for next release
-- PdfDocument.get_toc(): Replaced bookmark namedtuple `PdfOutlineItem` with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Also provide signed count as-is rather than needlessly splitting it in two variables (unsigned int `n_kids` and bool `is_closed`).
+- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Also provide signed count as-is rather than needlessly splitting in two variables (unsigned int `n_kids` and bool `is_closed`).
 
 <!-- TODO
 See https://github.com/pypdfium2-team/pypdfium2/blob/devel_old/docs/devel/changelog_staging.md
-for how to proceed. Note that some things are already done, and some rejected, though.
+for how to proceed. Note that some things are already done, and some rejected.
 -->
diff --git a/src/pypdfium2/_cli/toc.py b/src/pypdfium2/_cli/toc.py
index f05f50d6c..c19fe8749 100644
--- a/src/pypdfium2/_cli/toc.py
+++ b/src/pypdfium2/_cli/toc.py
@@ -31,10 +31,10 @@ def main(args):
         count, dest = bm.get_count(), bm.get_dest()
         out = "    " * bm.level
         out += "[%s] %s -> " % (
-            "*" if count == 0 else f"{count:+}",
+            f"{count:+}" if count != 0 else "*",
             bm.get_title(),
         )
-        # distinguish between "no dest" and "dest with invalid values" while keeping result machine readable
+        # distinguish between "no dest" and "dest with unknown mode" while keeping result machine readable
         if dest:
             index, (view_mode, view_pos) = dest.get_index(), dest.get_view()
             out += "%s  # %s %s" % (
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 2f9882b61..1ba968e9a 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -402,7 +402,7 @@ def new_page(self, width, height, index=None):
     def del_page(self, index):
         """
         Remove the page at *index* (zero-based).
-        It is recommended to close any open handles to the page before deleting it.
+        It is recommended to close any open handles to the page before calling this method.
         """
         # FIXME not sure how pdfium would behave if the caller tries to access a handle to a deleted page...
         pdfium_c.FPDFPage_Delete(self, index)
@@ -519,7 +519,7 @@ def get_toc(
             if level < max_depth-1:
                 yield from self.get_toc(max_depth=max_depth, parent=bm_ptr, level=level+1, seen=seen)
             elif pdfium_c.FPDFBookmark_GetFirstChild(self, bm_ptr):
-                # Warn only if there actually is a subtree. If level == max_depth but the tree ends there, it's fine as no information is skipped.
+                # Warn only if there actually is a subtree. If level == max_depth but the tree ends there, it's fine as no info is skipped.
                 logger.warning(f"Maximum recursion depth {max_depth} reached (subtree skipped).")
             
             bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr)
@@ -606,7 +606,7 @@ def as_pageobject(self):
         Returns:
             PdfObject: An independent page object representation of the XObject.
             If multiple page objects are created from one XObject, they share resources.
-            Page objects created from an XObject remain valid after the XObject is closed.
+            Pageobjects created from an XObject remain valid after the XObject is closed.
         """
         raw_pageobj = pdfium_c.FPDF_NewFormObjectFromXObject(self)
         return PdfObject(  # not a child object (see above)
@@ -651,7 +651,7 @@ class PdfBookmark (pdfium_i.AutoCastable):
         pdf (PdfDocument):
             Reference to the document this bookmark belongs to.
         level (int):
-            The bookmark's nesting level in the TOC tree. Corresponds to the number of parent bookmarks.
+            The bookmark's nesting level in the TOC tree (zero-based). Corresponds to the number of parent bookmarks.
     """
     
     def __init__(self, raw, pdf, level):
diff --git a/tests/test_toc.py b/tests/test_toc.py
index b8e0869a5..a732822a3 100644
--- a/tests/test_toc.py
+++ b/tests/test_toc.py
@@ -42,7 +42,7 @@ def test_gettoc():
     for bm in toc:
         dest = bm.get_dest()
         view_mode, view_pos = dest.get_view()
-        assert view_mode is pdfium_c.PDFDEST_VIEW_XYZ
+        assert view_mode == pdfium_c.PDFDEST_VIEW_XYZ
         assert round(view_pos[0]) == 89
     
     # check last bookmark

From 517630a9dc222eae144d85a468795beeff3db4dd Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 22:20:09 +0200
Subject: [PATCH 009/140] Continue on document and bitmap

Removed PdfDocument.render() & PdfBitmapInfo.
Implemented context manager support for PdfDocument.

Test suite integration TBD.
---
 docs/devel/changelog_staging.md    |  11 ++-
 src/pypdfium2/_cli/toc.py          |   2 +-
 src/pypdfium2/_helpers/bitmap.py   | 113 +++++++++++------------------
 src/pypdfium2/_helpers/document.py |  47 ++++--------
 4 files changed, 64 insertions(+), 109 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 2e1e10693..03efeb76b 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -4,9 +4,16 @@
 <!-- List character: dash (-) -->
 
 # Changelog for next release
-- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Also provide signed count as-is rather than needlessly splitting in two variables (unsigned int `n_kids` and bool `is_closed`).
+
+*API-breaking changes*
+- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also, distinguish between `dest == None` and an empty dest.
+- Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog).
+- Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
+
+*Improvements and new features*
+- Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
 
 <!-- TODO
 See https://github.com/pypdfium2-team/pypdfium2/blob/devel_old/docs/devel/changelog_staging.md
-for how to proceed. Note that some things are already done, and some rejected.
+for how to proceed. Note that some things have already been backported, and some rejected.
 -->
diff --git a/src/pypdfium2/_cli/toc.py b/src/pypdfium2/_cli/toc.py
index c19fe8749..5425a33ea 100644
--- a/src/pypdfium2/_cli/toc.py
+++ b/src/pypdfium2/_cli/toc.py
@@ -34,7 +34,7 @@ def main(args):
             f"{count:+}" if count != 0 else "*",
             bm.get_title(),
         )
-        # distinguish between "no dest" and "dest with unknown mode" while keeping result machine readable
+        # distinguish between "dest == None" and "dest with unknown mode" while keeping the output machine readable
         if dest:
             index, (view_mode, view_pos) = dest.get_index(), dest.get_view()
             out += "%s  # %s %s" % (
diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index 3fe4c1e30..b1de99244 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -1,12 +1,10 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = ("PdfBitmap", "PdfBitmapInfo")
+__all__ = ("PdfBitmap", )
 
 import ctypes
 import logging
-import weakref
-from collections import namedtuple
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
 from pypdfium2._helpers.misc import PdfiumError
@@ -28,16 +26,8 @@ class PdfBitmap (pdfium_i.AutoCloseable):
     """
     Bitmap helper class.
     
-    Hint:
-        This class provides built-in converters (e. g. :meth:`.to_pil`, :meth:`.to_numpy`) that may be used to create a different representation of the bitmap.
-        Converters can be applied on :class:`.PdfBitmap` objects either as bound method (``bitmap.to_*()``), or as function (``PdfBitmap.to_*(bitmap)``)
-        The second pattern is useful for API methods that need to apply a caller-provided converter (e. g. :meth:`.PdfDocument.render`)
-    
     .. _PIL Modes: https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
     
-    Note:
-        All attributes of :class:`.PdfBitmapInfo` are available in this class as well.
-    
     Warning:
         ``bitmap.close()``, which frees the buffer of foreign bitmaps, is not validated for safety.
         A bitmap must not be closed when other objects still depend on its buffer!
@@ -47,13 +37,36 @@ class PdfBitmap (pdfium_i.AutoCloseable):
             The underlying PDFium bitmap handle.
         buffer (~ctypes.c_ubyte):
             A ctypes array representation of the pixel data (each item is an unsigned byte, i. e. a number ranging from 0 to 255).
+        width (int):
+            Width of the bitmap (horizontal size).
+        height (int):
+            Height of the bitmap (vertical size).
+        stride (int):
+            Number of bytes per line in the bitmap buffer.
+            Depending on how the bitmap was created, there may be a padding of unused bytes at the end of each line, so this value can be greater than ``width * n_channels``.
+        format (int):
+            PDFium bitmap format constant (:attr:`FPDFBitmap_*`)
+        rev_byteorder (bool):
+            Whether the bitmap is using reverse byte order.
+        n_channels (int):
+            Number of channels per pixel.
+        mode (str):
+            The bitmap format as string (see `PIL Modes`_).
     """
     
     def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, needs_free):
-        self.raw, self.buffer, self.width, self.height = raw, buffer, width, height
-        self.stride, self.format, self.rev_byteorder = stride, format, rev_byteorder
+        self.raw = raw
+        self.buffer = buffer
+        self.width = width
+        self.height = height
+        self.stride = stride
+        self.format = format
+        self.rev_byteorder = rev_byteorder
         self.n_channels = pdfium_i.BitmapTypeToNChannels[self.format]
-        self.mode = (pdfium_i.BitmapTypeToStrReverse if self.rev_byteorder else pdfium_i.BitmapTypeToStr)[self.format]
+        self.mode = {
+            False: pdfium_i.BitmapTypeToStr,
+            True: pdfium_i.BitmapTypeToStrReverse,
+        }[self.rev_byteorder][self.format]
         super().__init__(pdfium_c.FPDFBitmap_Destroy, needs_free=needs_free, obj=self.buffer)
     
     
@@ -61,18 +74,6 @@ def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, ne
     def parent(self):  # AutoCloseable hook
         return None
     
-    
-    def get_info(self):
-        """
-        Returns:
-            PdfBitmapInfo: A namedtuple describing the bitmap.
-        """
-        return PdfBitmapInfo(
-            width=self.width, height=self.height, stride=self.stride, format=self.format,
-            rev_byteorder=self.rev_byteorder, n_channels=self.n_channels, mode=self.mode,
-        )
-    
-    
     @classmethod
     def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None):
         """
@@ -95,7 +96,7 @@ def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None):
         if ex_buffer is None:
             needs_free = True
             buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(raw)
-            if buffer_ptr is None:
+            if not buffer_ptr:
                 raise PdfiumError("Failed to get bitmap buffer (null pointer returned)")
             buffer = ctypes.cast(buffer_ptr, ctypes.POINTER(ctypes.c_ubyte * (stride * height))).contents
         else:
@@ -108,15 +109,15 @@ def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None):
         )
     
     
-    # TODO support setting stride if external buffer is provided
     @classmethod
-    def new_native(cls, width, height, format, rev_byteorder=False, buffer=None):
+    def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, stride=None):
         """
         Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by Python/ctypes.
         Bitmaps created by this function are always packed (no unused bytes at line end).
         """
         
-        stride = width * pdfium_i.BitmapTypeToNChannels[format]
+        if stride is None:
+            stride = width * pdfium_i.BitmapTypeToNChannels[format]
         if buffer is None:
             buffer = (ctypes.c_ubyte * (stride * height))()
         raw = pdfium_c.FPDFBitmap_CreateEx(width, height, format, buffer, stride)
@@ -211,14 +212,12 @@ def to_pil(self):
         """
         Convert the bitmap to a :mod:`PIL` image, using :func:`PIL.Image.frombuffer`.
         
-        For ``RGBA``, ``RGBX`` and ``L`` buffers, PIL is supposed to share memory with
-        the original bitmap buffer, so changes to the buffer should be reflected in the image, and vice versa.
+        For ``RGBA``, ``RGBX`` and ``L`` bitmaps, PIL is supposed to share memory with
+        the original buffer, so changes to the buffer should be reflected in the image, and vice versa.
         Otherwise, PIL will make a copy of the data.
         
         Returns:
             PIL.Image.Image: PIL image (representation or copy of the bitmap buffer).
-        
-        .. versionchanged:: 4.16 Set ``image.readonly = False`` so that changes to the image are also reflected in the buffer.
         """
         
         # https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.frombuffer
@@ -234,45 +233,38 @@ def to_pil(self):
             self.stride,                # bytes per line
             1,                          # orientation (top->bottom)
         )
+        # set `readonly = False` so changes to the image are reflected in the buffer, if the original buffer is used
         image.readonly = False
         
         return image
     
     
     @classmethod
-    def from_pil(cls, pil_image, recopy=False):
+    def from_pil(cls, pil_image):
         """
         Convert a :mod:`PIL` image to a PDFium bitmap.
-        Due to the restricted number of color formats and bit depths supported by PDFium's
-        bitmap implementation, this may be a lossy operation.
+        Due to the restricted number of color formats and bit depths supported by FPDF_BITMAP, this may be a lossy operation.
         
-        Bitmaps returned by this function should be treated as immutable (i.e. don't call :meth:`.fill_rect`).
+        Bitmaps returned by this function should be treated as immutable.
         
         Parameters:
             pil_image (PIL.Image.Image):
                 The image.
         Returns:
             PdfBitmap: PDFium bitmap (with a copy of the PIL image's data).
-        
-        .. deprecated:: 4.25
-           The *recopy* parameter has been deprecated.
         """
         
+        # FIXME possibility to get mutable buffer from PIL image?
+        
         if pil_image.mode in pdfium_i.BitmapStrToConst:
-            # PIL always seems to represent BGR(A/X) input as RGB(A/X), so this code passage is probably only hit for L
+            # PIL always seems to represent BGR(A/X) input as RGB(A/X), so this code passage would only be reached for L
             format = pdfium_i.BitmapStrToConst[pil_image.mode]
         else:
             pil_image = _pil_convert_for_pdfium(pil_image)
             format = pdfium_i.BitmapStrReverseToConst[pil_image.mode]
         
-        py_buffer = pil_image.tobytes()
-        if recopy:
-            buffer = (ctypes.c_ubyte * len(py_buffer)).from_buffer_copy(py_buffer)
-        else:
-            buffer = py_buffer
-        
         w, h = pil_image.size
-        return cls.new_native(w, h, format, rev_byteorder=False, buffer=buffer)
+        return cls.new_native(w, h, format, rev_byteorder=False, buffer=pil_image.tobytes())
     
     
     # TODO implement from_numpy()
@@ -280,8 +272,6 @@ def from_pil(cls, pil_image, recopy=False):
 
 def _pil_convert_for_pdfium(pil_image):
     
-    # FIXME? convoluted / hard to understand; improve control flow
-    
     if pil_image.mode == "1":
         pil_image = pil_image.convert("L")
     elif pil_image.mode.startswith("RGB"):
@@ -304,24 +294,3 @@ def _pil_convert_for_pdfium(pil_image):
         pil_image = PIL.Image.merge("RGBX", (b, g, r, x))
     
     return pil_image
-
-
-PdfBitmapInfo = namedtuple("PdfBitmapInfo", "width height stride format rev_byteorder n_channels mode")
-"""
-Attributes:
-    width (int):
-        Width of the bitmap (horizontal size).
-    height (int):
-        Height of the bitmap (vertical size).
-    stride (int):
-        Number of bytes per line in the bitmap buffer.
-        Depending on how the bitmap was created, there may be a padding of unused bytes at the end of each line, so this value can be greater than ``width * n_channels``.
-    format (int):
-        PDFium bitmap format constant (:attr:`FPDFBitmap_*`)
-    rev_byteorder (bool):
-        Whether the bitmap is using reverse byte order.
-    n_channels (int):
-        Number of channels per pixel.
-    mode (str):
-        The bitmap format as string (see `PIL Modes`_).
-"""
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 1ba968e9a..2cb8b7e3f 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -42,9 +42,11 @@ class PdfDocument (pdfium_i.AutoCloseable):
         FileNotFoundError: Raised if an invalid or non-existent file path was given.
     
     Hint:
+        * Documents may be used in a ``with``-block, closing the document on context manager exit.
+          This is recommended when *input_data* is a file path, to safely and immediately release the opened file handle.
         * :func:`len` may be called to get a document's number of pages.
-        * Looping over a document will yield its pages from beginning to end.
         * Pages may be loaded using list index access.
+        * Looping over a document will yield its pages from beginning to end.
         * The ``del`` keyword and list index access may be used to delete pages.
     
     Attributes:
@@ -68,8 +70,6 @@ def __init__(self, input, password=None, autoclose=False):
         self._autoclose = autoclose
         self._data_holder = []
         self._data_closer = []
-        
-        # question: can we make attributes like formenv effectively immutable for the caller?
         self.formenv = None
         
         if isinstance(self._input, pdfium_c.FPDF_DOCUMENT):
@@ -82,6 +82,16 @@ def __init__(self, input, password=None, autoclose=False):
         super().__init__(PdfDocument._close_impl, self._data_holder, self._data_closer)
     
     
+    # Support using PdfDocument in a with-block
+    # Note that pdfium objects have to be closed in hierarchical order, but as this is ensured by the parents/kids system, callers don't have to worry about that.
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, *_):
+        self.close()
+    
+    
     def __repr__(self):
         if isinstance(self._input, Path):
             input_r = repr( str(self._input) )
@@ -523,37 +533,6 @@ def get_toc(
                 logger.warning(f"Maximum recursion depth {max_depth} reached (subtree skipped).")
             
             bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr)
-    
-    
-    def render(
-            self,
-            converter,
-            renderer = PdfPage.render,
-            page_indices = None,
-            pass_info = False,
-            n_processes = None,    # ignored, retained for compat
-            mk_formconfig = None,  # ignored, retained for compat
-            **kwargs
-        ):
-        """
-        .. deprecated:: 4.19
-           This method will be removed with the next major release due to serious issues rooted in the original API design. Use :meth:`.PdfPage.render()` instead.
-           *Note that the CLI provides parallel rendering using a proper caller-side process pool with inline saving in rendering jobs.*
-        
-        .. versionchanged:: 4.25
-           Removed the original process pool implementation and turned this into a wrapper for linear rendering, due to the serious conceptual issues and possible memory load escalation, especially with expensive receiving code (e.g. PNG encoding) or long documents. See the changelog for more info
-        """
-        
-        warnings.warn("The document-level pdf.render() API is deprecated and uncored due to serious issues in the original concept. Use page.render() and a caller-side loop or process pool instead.", category=DeprecationWarning)
-        
-        if not page_indices:
-            page_indices = [i for i in range(len(self))]
-        for i in page_indices:
-            bitmap = renderer(self[i], **kwargs)
-            if pass_info:
-                yield (converter(bitmap), bitmap.get_info())
-            else:
-                yield converter(bitmap)
 
 
 class PdfFormEnv (pdfium_i.AutoCloseable):

From 677c4984eee092cf3bb7665a38a948bfe5227adc Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Apr 2024 23:02:25 +0200
Subject: [PATCH 010/140] Work on `PdfImage.extract()`

---
 docs/devel/changelog_staging.md       |  4 ++-
 src/pypdfium2/_cli/extract_images.py  |  4 +--
 src/pypdfium2/_helpers/pageobjects.py | 39 +++++++++++----------------
 3 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 03efeb76b..f7389f560 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -6,9 +6,11 @@
 # Changelog for next release
 
 *API-breaking changes*
-- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also, distinguish between `dest == None` and an empty dest.
 - Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog).
+  Instead, use `PdfPage.render()` with a loop or process pool.
 - Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
+- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest.
+- Removed `fb_render` parameter from `PdfImage.extract()` because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
 
 *Improvements and new features*
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py
index 6fd94569e..6091d3489 100644
--- a/src/pypdfium2/_cli/extract_images.py
+++ b/src/pypdfium2/_cli/extract_images.py
@@ -37,7 +37,7 @@ def attach(parser):
     parser.add_argument(
         "--render",
         action = "store_true",
-        help = "Whether to get rendered bitmaps, taking masks and transform matrices into account. (Fallback if doing smart extraction.)",
+        help = "Whether to get rendered bitmaps, taking masks and transform matrices into account. (requires --use-bitmap, ignored with smart extraction)",
     )
 
 
@@ -71,7 +71,7 @@ def main(args):
                     pil_image = image.get_bitmap(render=args.render).to_pil()
                     pil_image.save( prefix.with_suffix("."+args.format) )
                 else:
-                    image.extract(prefix, fb_format=args.format, fb_render=args.render)
+                    image.extract(prefix, fb_format=args.format)
             except pdfium.PdfiumError:
                 traceback.print_exc()
             image.close()
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 2be708f1a..a9702f2d2 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -23,8 +23,7 @@ class PdfObject (pdfium_i.AutoCloseable):
     """
     Page object helper class.
     
-    When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead,
-    depending on the object's :attr:`.type` (e. g. :class:`.PdfImage`).
+    When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, depending on the object's :attr:`.type` (e. g. :class:`.PdfImage`).
     
     Attributes:
         raw (FPDF_PAGEOBJECT):
@@ -121,15 +120,13 @@ def transform(self, matrix):
         pdfium_c.FPDFPageObj_Transform(self, *matrix.get())
 
 
-# In principle, we would like to move PdfImage to a separate file, but it's not that easy because of the two-fold connection with PdfObject, which would run us into a circular import. (However, what we could do is externalize the class under a different name and turn PdfImage into a wrapper which merely inherits from that class.)
-
 class PdfImage (PdfObject):
     """
     Image object helper class (specific kind of page object).
     """
     
     # cf. https://crbug.com/pdfium/1203
-    #: Filters applied by :func:`FPDFImageObj_GetImageDataDecoded`. Hereafter referred to as "simple filters", while non-simple filters will be called "complex filters".
+    #: Filters applied by :func:`FPDFImageObj_GetImageDataDecoded`, referred to as "simple filters". Other filters are considered "complex filters".
     SIMPLE_FILTERS = ("ASCIIHexDecode", "ASCII85Decode", "RunLengthDecode", "FlateDecode", "LZWDecode")
     
     
@@ -141,7 +138,7 @@ def new(cls, pdf):
         Returns:
             PdfImage: Handle to a new, empty image.
             Note that position and size of the image are defined by its matrix, which defaults to the identity matrix.
-            This means that new images will appear as a tiny square of 1x1 units on the bottom left corner of the page.
+            This means that new images will appear as a tiny square of 1x1 canvas units on the bottom left corner of the page.
             Use :class:`.PdfMatrix` and :meth:`.set_matrix` to adjust size and position.
         """
         raw_img = pdfium_c.FPDFPageObj_NewImageObj(pdf)
@@ -155,7 +152,7 @@ def get_metadata(self):
         
         Note:
             * The DPI values signify the resolution of the image on the PDF page, not the DPI metadata embedded in the image file.
-            * Due to issues in PDFium, this function can be slow. If you only need image size, prefer the faster :meth:`.get_size` instead.
+            * Due to issues in pdfium, this function might be slow on some kinds of images. If you only need size, prefer :meth:`.get_size` instead.
         
         Returns:
             FPDF_IMAGEOBJ_METADATA: Image metadata structure
@@ -170,8 +167,6 @@ def get_metadata(self):
     
     def get_size(self):
         """
-        .. versionadded:: 4.8/5731
-        
         Returns:
             (int, int): Image dimensions as a tuple of (width, height).
         """
@@ -310,23 +305,21 @@ def get_filters(self, skip_simple=False):
     
     
     def extract(self, dest, *args, **kwargs):
-        # TODO rewrite/simplify docstring
         """
-        Extract the image into an independently usable file or byte buffer.
-        Where possible within PDFium's limited public API, it will be attempted to transfer the image data directly,
-        avoiding an unnecessary layer of decoding and re-encoding.
-        Otherwise, the fully decoded data will be retrieved and (re-)encoded using :mod:`PIL`.
+        Extract the image into an independently usable file or byte buffer, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits.
         
-        As PDFium does not expose all required information, only DCTDecode (JPEG) and JPXDecode (JPEG 2000) images can be extracted directly.
-        For images with complex filters, the bitmap data is used. Otherwise, ``get_data(decode_simple=True)`` is used, which avoids lossy conversion for images whose bit depth or colour format is not supported by PDFium's bitmap implementation.
+        Only DCTDecode (JPEG) and JPXDecode (JPEG 2000) images can be extracted directly.
+        Otherwise, the pixel data is decoded, and re-encoded using :mod:`PIL`.
+        For images with simple filters only, ``get_data(decode_simple=True)`` is used for decoding to preserve higher bit depth or special color formats not supported by FPDF_BITMAP.
+        For images with complex filters, we have to resort to :meth:`.get_bitmap`, which can be a lossy operation.
+        
+        Note, this method ignores alpha masks and some other data stored separately from the main data stream (e.g. BlackIsWhite), which might lead to incorrect representation of the image.
         
         Parameters:
             dest (str | io.BytesIO):
                 File prefix or byte buffer to which the image shall be written.
             fb_format (str):
                 The image format to use in case it is necessary to (re-)encode the data.
-            fb_render (bool):
-                Whether the image should be rendered if falling back to bitmap-based extraction.
         """
         
         # https://crbug.com/pdfium/1930
@@ -367,15 +360,13 @@ def _get_pil_mode(colorspace, bpp):
         return None
 
 
-def _extract_smart(image_obj, fb_format=None, fb_render=False):
-    
-    # FIXME somewhat hard to read...
+def _extract_smart(image_obj, fb_format=None):
     
     try:
         data, info = _extract_direct(image_obj)
     except ImageNotExtractableError:
         # TODO? log reason why the image cannot be extracted directly
-        pil_image = image_obj.get_bitmap(render=fb_render).to_pil()
+        pil_image = image_obj.get_bitmap(render=False).to_pil()
     else:
         pil_image = None
         format = info.format
@@ -389,7 +380,9 @@ def _extract_smart(image_obj, fb_format=None, fb_render=False):
             )
     
     if pil_image:
-        format = fb_format if fb_format else "tiff" if pil_image.mode == "CMYK" else "png"
+        format = fb_format
+        if not format:
+            format = {"CMYK": "tiff"}.get(pil_image.mode, "png")
     
     buffer = yield format
     pil_image.save(buffer, format=format) if pil_image else buffer.write(data)

From 4de863d8099d686a844815d8b5b43fe8bcecadc5 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 00:02:27 +0200
Subject: [PATCH 011/140] Fix some object pointer checks against None

Use bool() rather than checking against None. See findings in get_toc():
"We need bool(ptr) here to handle cases where .contents is a null
pointer (raises exception on access). Don't use ptr != None, it's always
true."
---
 src/pypdfium2/_helpers/document.py    | 2 +-
 src/pypdfium2/_helpers/page.py        | 2 +-
 src/pypdfium2/_helpers/pageobjects.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 2cb8b7e3f..c518603b2 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -486,7 +486,7 @@ def page_as_xobject(self, index, dest_pdf):
             PdfXObject: The page as XObject.
         """
         raw_xobject = pdfium_c.FPDF_NewXObjectFromPage(dest_pdf, self, index)
-        if raw_xobject is None:
+        if not raw_xobject:
             raise PdfiumError(f"Failed to capture page at index {index} as FPDF_XOBJECT.")
         xobject = PdfXObject(raw=raw_xobject, pdf=dest_pdf)
         self._add_kid(xobject)
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 38c7ab4be..f536b6e5b 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -280,7 +280,7 @@ def get_objects(self, filter=None, max_depth=2, form=None, level=0):
         for i in range(n_objects):
             
             raw_obj = get_object(parent, i)
-            if raw_obj is None:
+            if not raw_obj:
                 raise PdfiumError("Failed to get page object.")
             
             helper_obj = PdfObject(raw_obj, page=self, pdf=self.pdf, level=level)
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index a9702f2d2..0d712fec3 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -258,7 +258,7 @@ def get_bitmap(self, render=False):
         else:
             raw_bitmap = pdfium_c.FPDFImageObj_GetBitmap(self)
         
-        if raw_bitmap is None:
+        if not raw_bitmap:
             raise PdfiumError(f"Failed to get bitmap of image {self}.")
         
         return PdfBitmap.from_raw(raw_bitmap)

From ccfe92358b44af604f09c9f9fc932826e09a0931 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 00:06:53 +0200
Subject: [PATCH 012/140] Address `run check` findings

---
 README.md                          | 2 +-
 docs/devel/changelog.md            | 4 ++--
 src/pypdfium2/_helpers/document.py | 5 -----
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index d052a55d1..adad42e55 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct
 * <a id="user-content-install-source" class="anchor" href="#install-source">From source 🔗</a>
   
   * Dependencies:
-    - System: git, C pre-processor (gcc/clang - alternatively, specify the command to envoke via `$CPP`)
+    - System: git, C pre-processor (gcc/clang - alternatively, specify the command to invoke via `$CPP`)
     - Python: ctypesgen (pypdfium2-team fork), wheel, setuptools. Usually installed automatically.
   
   * Get the code
diff --git a/docs/devel/changelog.md b/docs/devel/changelog.md
index efe398cd3..3c1c7b7d3 100644
--- a/docs/devel/changelog.md
+++ b/docs/devel/changelog.md
@@ -23,7 +23,7 @@
 ## 4.26.0 (2024-01-10)
 
 - Updated PDFium from `6164` to `6233`.
-- Pin ctypesgen in sdist to prevent reoccurrence of {issue}`264` / {issue}`286`. As a drawback, the pin is never committed, so the sdist is not simply reproducible at this time due to dependence on the latest commit hash of the ctypesgen fork at build time.
+- Pin ctypesgen in sdist to prevent re-occurrence of {issue}`264` / {issue}`286`. As a drawback, the pin is never committed, so the sdist is not simply reproducible at this time due to dependence on the latest commit hash of the ctypesgen fork at build time.
 - Wheel tags: Added back `manylinux2014` in addition to `manylinux_{glibc_ver}` to be on the safe side. Suspected relation to the above issues.
 
 
@@ -44,7 +44,7 @@
 
 - The parallel rendering API unfortunately was an inherent design mistake: Multiprocessing is not meant to transfer large amounts of pixel data from workers to the main process.
 - This was such a heavy drawback that it basically outweighed the parallelization, so there was no real performance advantage, only higher memory load.
-- As a related problem, the worker pool produces bitmaps at an indepedent speed, regardless of where the receiving iteration might be, so bitmaps could queue up in memory, possibly causing an enormeous rise in memory consumption over time. This effect was pronounced e.g. with PNG saving via PIL, as exhibited in Facebook's `nougat` project.
+- As a related problem, the worker pool produces bitmaps at an independent speed, regardless of where the receiving iteration might be, so bitmaps could queue up in memory, possibly causing an enormeous rise in memory consumption over time. This effect was pronounced e.g. with PNG saving via PIL, as exhibited in Facebook's `nougat` project.
 - Instead, each bitmap should be processed (e.g. saved) in the job which created it. Only a minimal, final result should be sent back to the main process (e.g. a file path).
 - This means we cannot reasonably provide a generic parallel renderer, instead it needs to be implemented by callers.
 - Historically, note that there had been even more faults in the implementation:
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index c518603b2..81eed78f3 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -3,14 +3,9 @@
 
 __all__ = ("PdfDocument", "PdfFormEnv", "PdfXObject", "PdfBookmark", "PdfDest")
 
-import os
 import ctypes
 import logging
-import inspect
-import warnings
 from pathlib import Path
-from collections import namedtuple
-import multiprocessing as mp
 
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i

From 2360165e2cea18aae0856167e3549ba1bcf846d0 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 00:14:15 +0200
Subject: [PATCH 013/140] Expand constructor assignments

This is longer, but cleaner.
Imagine having to edit a one-line tuple assignment and getting the order wrong :P

BTW, normalize PdfFormEnv constructor param order.
---
 src/pypdfium2/_helpers/attachment.py  |  3 ++-
 src/pypdfium2/_helpers/document.py    | 18 ++++++++++++------
 src/pypdfium2/_helpers/matrix.py      |  3 ---
 src/pypdfium2/_helpers/page.py        |  4 +++-
 src/pypdfium2/_helpers/pageobjects.py |  5 ++++-
 5 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/pypdfium2/_helpers/attachment.py b/src/pypdfium2/_helpers/attachment.py
index 5b69e62b6..ef5f45457 100644
--- a/src/pypdfium2/_helpers/attachment.py
+++ b/src/pypdfium2/_helpers/attachment.py
@@ -36,7 +36,8 @@ class PdfAttachment (pdfium_i.AutoCastable):
     
     
     def __init__(self, raw, pdf):
-        self.raw, self.pdf = raw, pdf
+        self.raw = raw
+        self.pdf = pdf
     
     
     def get_name(self):
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 81eed78f3..75ef2ef90 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -172,7 +172,7 @@ def init_forms(self, config=None):
         raw = pdfium_c.FPDFDOC_InitFormFillEnvironment(self, config)
         if not raw:
             raise PdfiumError(f"Initializing form env failed for document {self}.")
-        self.formenv = PdfFormEnv(raw, config, self)
+        self.formenv = PdfFormEnv(raw, self, config)
         self._add_kid(self.formenv)
         
         if formtype in (pdfium_c.FORMTYPE_XFA_FULL, pdfium_c.FORMTYPE_XFA_FOREGROUND):
@@ -543,8 +543,10 @@ class PdfFormEnv (pdfium_i.AutoCloseable):
             Parent document this form env belongs to.
     """
     
-    def __init__(self, raw, config, pdf):
-        self.raw, self.config, self.pdf = raw, config, pdf
+    def __init__(self, raw, pdf, config):
+        self.raw = raw
+        self.pdf = pdf
+        self.config = config
         super().__init__(PdfFormEnv._close_impl, self.config, self.pdf)
     
     @property
@@ -568,7 +570,8 @@ class PdfXObject (pdfium_i.AutoCloseable):
     """
     
     def __init__(self, raw, pdf):
-        self.raw, self.pdf = raw, pdf
+        self.raw = raw
+        self.pdf = pdf
         super().__init__(pdfium_c.FPDF_CloseXObject)
     
     @property
@@ -629,7 +632,9 @@ class PdfBookmark (pdfium_i.AutoCastable):
     """
     
     def __init__(self, raw, pdf, level):
-        self.raw, self.pdf, self.level = raw, pdf, level
+        self.raw = raw
+        self.pdf = pdf
+        self.level = level
     
     def get_title(self):
         """
@@ -670,7 +675,8 @@ class PdfDest (pdfium_i.AutoCastable):
     """
     
     def __init__(self, raw, pdf):
-        self.raw, self.pdf = raw, pdf
+        self.raw = raw
+        self.pdf = pdf
     
     def get_index(self):
         """
diff --git a/src/pypdfium2/_helpers/matrix.py b/src/pypdfium2/_helpers/matrix.py
index 4d9aff402..a8cab515d 100644
--- a/src/pypdfium2/_helpers/matrix.py
+++ b/src/pypdfium2/_helpers/matrix.py
@@ -40,17 +40,14 @@ class PdfMatrix:
     def __init__(self, a=1, b=0, c=0, d=1, e=0, f=0):
         self.a, self.b, self.c, self.d, self.e, self.f = a, b, c, d, e, f
     
-    
     def __repr__(self):
         return f"PdfMatrix{self.get()}"
     
-    
     def __eq__(self, other):
         if type(self) is not type(other):
             return False
         return (self.get() == other.get())
     
-    
     @property
     def _as_parameter_(self):
         return ctypes.byref( self.to_raw() )
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index f536b6e5b..e0dd5d42c 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -27,7 +27,9 @@ class PdfPage (pdfium_i.AutoCloseable):
     """
     
     def __init__(self, raw, pdf, formenv):
-        self.raw, self.pdf, self.formenv = raw, pdf, formenv
+        self.raw = raw
+        self.pdf = pdf
+        self.formenv = formenv
         super().__init__(PdfPage._close_impl, self.formenv)
     
     
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 0d712fec3..e83bbf5c7 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -55,7 +55,10 @@ def __new__(cls, raw, *args, **kwargs):
     
     def __init__(self, raw, page=None, pdf=None, level=0):
         
-        self.raw, self.page, self.pdf, self.level = raw, page, pdf, level
+        self.raw = raw
+        self.page = page
+        self.pdf = pdf
+        self.level = level
         
         if page is not None:
             if self.pdf is None:

From c581f5af2614cbd74e3b1fbd785f1d9e7b6bee53 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 00:22:06 +0200
Subject: [PATCH 014/140] autorelease: add task

---
 setupsrc/pypdfium2_setup/autorelease.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setupsrc/pypdfium2_setup/autorelease.py b/setupsrc/pypdfium2_setup/autorelease.py
index 520f712c4..85d94639a 100644
--- a/setupsrc/pypdfium2_setup/autorelease.py
+++ b/setupsrc/pypdfium2_setup/autorelease.py
@@ -140,6 +140,7 @@ def make_releasenotes(summary, prev_pdfium, new_pdfium, prev_tag, new_tag, c_upd
     if c_updates:
         with tempfile.TemporaryDirectory() as tmpdir:
             tmpdir = Path(tmpdir)
+            # FIXME seems to take rather long - possibility to limit history size?
             run_cmd(["git", "clone", "--filter=blob:none", "--no-checkout", PdfiumURL, "pdfium_history"], cwd=tmpdir)
             relnotes += _get_log(
                 "PDFium", PdfiumURL, tmpdir/"pdfium_history",

From 81f2b4af50144ba75bf12ef06bd82ddbb5863534 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 00:26:31 +0200
Subject: [PATCH 015/140] slightly improve wording for v4.25 changelog

---
 docs/devel/changelog.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/devel/changelog.md b/docs/devel/changelog.md
index 3c1c7b7d3..f1e08d3a0 100644
--- a/docs/devel/changelog.md
+++ b/docs/devel/changelog.md
@@ -43,10 +43,10 @@
 #### Rationale for `PdfDocument.render()` deprecation
 
 - The parallel rendering API unfortunately was an inherent design mistake: Multiprocessing is not meant to transfer large amounts of pixel data from workers to the main process.
-- This was such a heavy drawback that it basically outweighed the parallelization, so there was no real performance advantage, only higher memory load.
-- As a related problem, the worker pool produces bitmaps at an independent speed, regardless of where the receiving iteration might be, so bitmaps could queue up in memory, possibly causing an enormeous rise in memory consumption over time. This effect was pronounced e.g. with PNG saving via PIL, as exhibited in Facebook's `nougat` project.
+- Bitmap transfer is so expensive that it essentially outweighed parallelization, so there was no real performance advantage, only higher memory load.
+- As a related problem, the worker pool produces bitmaps at an independent speed, regardless of where the receiving iteration might be, so bitmaps could queue up in memory, possibly causing an enormous rise in memory consumption over time. This effect was pronounced e.g. with PNG saving via PIL, as seen in Facebook's `nougat` project.
 - Instead, each bitmap should be processed (e.g. saved) in the job which created it. Only a minimal, final result should be sent back to the main process (e.g. a file path).
-- This means we cannot reasonably provide a generic parallel renderer, instead it needs to be implemented by callers.
+- This means we cannot reasonably provide a generic parallel renderer; instead it needs to be implemented by callers.
 - Historically, note that there had been even more faults in the implementation:
   * Prior to `4.22.0`, the pool was always initialized with `os.cpu_count()` processes by default, even when rendering less pages.
   * Prior to `4.20.0`, a full-scale input transfer was conducted on each job (rendering it unusable with bytes input). However, this can and should be done only once on process creation.

From ddc3f3a036c57f6b653ac0119845862f852413f5 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 14:39:04 +0200
Subject: [PATCH 016/140] Remove deprecated version API

---
 docs/devel/changelog_staging.md |  3 ++-
 src/pypdfium2/version.py        | 25 ++++---------------------
 2 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index f7389f560..c2ddea672 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -10,7 +10,8 @@
   Instead, use `PdfPage.render()` with a loop or process pool.
 - Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
 - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest.
-- Removed `fb_render` parameter from `PdfImage.extract()` because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
+- Removed `fb_render` param from `PdfImage.extract()` because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
+- Removed some deprecated members/params (e.g. legacy version flags, `recopy` of `PdfBitmap.from_pil()`)
 
 *Improvements and new features*
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py
index 7dd604a25..dfbab36af 100644
--- a/src/pypdfium2/version.py
+++ b/src/pypdfium2/version.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = []
+__all__ = ("PYPDFIUM_INFO", "PDFIUM_INFO")
 
 import sys
 import json
@@ -11,7 +11,8 @@
 import pypdfium2_raw
 
 
-# TODO move to shared compat file
+# TODO remove caching and just assign everything on init/lib startup
+
 if sys.version_info < (3, 8):
     def cached_property(func):
         return property( functools.lru_cache(maxsize=1)(func) )
@@ -116,29 +117,11 @@ def desc(self):
 # TODO(future) add bindings info (e.g. ctypesgen version, reference/generated, runtime libdirs)
 
 
-# Current API
+# API
 
 PYPDFIUM_INFO = _version_pypdfium2()
 PDFIUM_INFO = _version_pdfium()
 
-__all__ += ["PYPDFIUM_INFO", "PDFIUM_INFO"]
-
-# -----
-
-
-# Deprecated API, to be removed with v5
-# Known issue: causes eager evaluation of the new API's theoretically deferred properties.
-
-V_PYPDFIUM2 = PYPDFIUM_INFO.version
-V_LIBPDFIUM = str(PDFIUM_INFO.build)
-V_BUILDNAME = PDFIUM_INFO.origin
-V_PDFIUM_IS_V8 = "V8" in PDFIUM_INFO.flags  # implies XFA
-V_LIBPDFIUM_FULL = PDFIUM_INFO.version
-
-__all__ += ["V_PYPDFIUM2", "V_LIBPDFIUM", "V_LIBPDFIUM_FULL", "V_BUILDNAME", "V_PDFIUM_IS_V8"]
-
-# -----
-
 
 # Docs
 

From 3acc545704f7b24d3cc918d179005ec408dc035e Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 15:09:40 +0200
Subject: [PATCH 017/140] Simplify version impl

I figured the deferred API doesn't make much sense, because this should
compute quickly anyway. Caching is more for expensive properties.

Also, we have to access the pdfium info on init for XFA/V8 checks, so it
would never be truly deferred anyway.
---
 docs/devel/changelog_staging.md |  1 +
 src/pypdfium2/version.py        | 98 +++++++++++----------------------
 2 files changed, 32 insertions(+), 67 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index c2ddea672..ed89388cf 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -15,6 +15,7 @@
 
 *Improvements and new features*
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
+- Simplified version implementation (no API change expected). All attributes are now assigned and show up in `dir(...)`, instead of `__getattr__` magic.
 
 <!-- TODO
 See https://github.com/pypdfium2-team/pypdfium2/blob/devel_old/docs/devel/changelog_staging.md
diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py
index dfbab36af..78006c11b 100644
--- a/src/pypdfium2/version.py
+++ b/src/pypdfium2/version.py
@@ -1,68 +1,48 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
+# TODO(future) add bindings info (e.g. ctypesgen version, reference/generated, runtime libdirs)
+
 __all__ = ("PYPDFIUM_INFO", "PDFIUM_INFO")
 
-import sys
 import json
-import functools
 from pathlib import Path
-from types import MappingProxyType
 import pypdfium2_raw
 
 
-# TODO remove caching and just assign everything on init/lib startup
-
-if sys.version_info < (3, 8):
-    def cached_property(func):
-        return property( functools.lru_cache(maxsize=1)(func) )
-else:
-    cached_property = functools.cached_property
-
-
 class _abc_version:
     
-    @cached_property
-    def _data(self):
+    def __init__(self):
+        
         with open(self._FILE, "r") as buf:
             data = json.load(buf)
-        self._process_data(data)
-        return MappingProxyType(data)
-    
-    def _process_data(self, data):
-        pass
-    
-    def __getattr__(self, attr):
-        return self._data[attr]
-    
-    def __setattr__(self, name, value):
-        raise AttributeError(f"Version class is immutable - assignment '{name} = {value}' not allowed")
+        for k, v in data.items():
+            setattr(self, k, v)
+        self.api_tag = tuple(data[k] for k in self._TAG_FIELDS)
+        self._hook()
+        self.version = self.tag + self.desc
+        
+        def frozen_setattr(self, name, value):
+            raise AttributeError(f"Version class is immutable - assignment '{name} = {value}' not allowed")
+        self.__setattr__ = frozen_setattr
     
     def __repr__(self):
         return self.version
     
-    @cached_property
-    def api_tag(self):
-        return tuple(self._data[k] for k in self._TAG_FIELDS)
-    
     def _craft_tag(self):
         return ".".join(str(v) for v in self.api_tag)
     
-    def _craft_desc(self, extra=[]):
+    def _craft_desc(self, suffix=[]):
         
         local_ver = []
         if self.n_commits > 0:
             local_ver += [str(self.n_commits), str(self.hash)]
-        local_ver += extra
+        local_ver += suffix
         
         desc = ""
         if local_ver:
             desc += "+" + ".".join(local_ver)
         return desc
-    
-    @cached_property
-    def version(self):
-        return self.tag + self.desc
 
 
 class _version_pypdfium2 (_abc_version):
@@ -70,27 +50,18 @@ class _version_pypdfium2 (_abc_version):
     _FILE = Path(__file__).parent / "version.json"
     _TAG_FIELDS = ("major", "minor", "patch")
     
-    @cached_property
-    def tag(self):
-        tag = self._craft_tag()
+    def _hook(self):
+        
+        self.tag = self._craft_tag()
         if self.beta is not None:
-            tag += f"b{self.beta}"
-        return tag
+            self.tag += f"b{self.beta}"
     
-    @cached_property
-    def desc(self):
-        
-        extra = []
-        if self.dirty:
-            extra += ["dirty"]
-        
-        desc = self._craft_desc(extra)
+        suffix = ["dirty"] if self.dirty else []
+        self.desc = self._craft_desc(suffix)
         if self.data_source != "git":
-            desc += f":{self.data_source}"
+            self.desc += f":{self.data_source}"
         if self.is_editable:
-            desc += "@editable"
-        
-        return desc
+            self.desc += "@editable"
 
 
 class _version_pdfium (_abc_version):
@@ -98,23 +69,16 @@ class _version_pdfium (_abc_version):
     _FILE = Path(pypdfium2_raw.__file__).parent / "version.json"
     _TAG_FIELDS = ("major", "minor", "build", "patch")
     
-    def _process_data(self, data):
-        data["flags"] = tuple(data["flags"])
-    
-    @cached_property
-    def tag(self):
-        return self._craft_tag()
-    
-    @cached_property
-    def desc(self):
-        desc = self._craft_desc()
+    def _hook(self):
+        
+        self.flags = tuple(self.flags)
+        self.tag = self._craft_tag()
+        
+        self.desc = self._craft_desc()
         if self.flags:
-            desc += ":{%s}" % ",".join(self.flags)
+            self.desc += ":{%s}" % ",".join(self.flags)
         if self.origin != "pdfium-binaries":
-            desc += f"@{self.origin}"
-        return desc
-
-# TODO(future) add bindings info (e.g. ctypesgen version, reference/generated, runtime libdirs)
+            self.desc += f"@{self.origin}"
 
 
 # API

From 9d87ae075dd4402581c0e3b39b59ee54432457e7 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 20:09:41 +0200
Subject: [PATCH 018/140] readme: remove python 3.7.6/3.8.1 incompat

pypdfium2 is now always built with our fork of ctypesgen, so this is no
longer an issue.
---
 README.md | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/README.md b/README.md
index adad42e55..d611f769f 100644
--- a/README.md
+++ b/README.md
@@ -667,12 +667,6 @@ Roadmap:
 
 ### Known limitations
 
-#### Incompatibility with CPython 3.7.6 and 3.8.1
-
-pypdfium2 built with mainstream ctypesgen cannot be used with releases 3.7.6 and 3.8.1 of the CPython interpreter due to a [regression](https://github.com/python/cpython/pull/16799#issuecomment-612353119) that [broke](https://github.com/ctypesgen/ctypesgen/issues/77) ctypesgen-created string handling code.
-
-Since version 4, pypdfium2 is built with a patched fork of ctypesgen that removes ctypesgen's problematic string code.
-
 #### Risk of unknown object lifetime violations
 
 As outlined in the raw API section, it is essential that Python-managed resources remain available as long as they are needed by PDFium.

From df24061dc18a7678eaaabab7be98413df7d63e70 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 20:39:32 +0200
Subject: [PATCH 019/140] Remove color scheme from rendering

(see changelog for rationale)
---
 docs/devel/changelog_staging.md |  1 +
 src/pypdfium2/_cli/render.py    | 54 ++-------------------------------
 src/pypdfium2/_helpers/page.py  | 43 ++------------------------
 tests_old/test_renderer.py      | 22 +-------------
 4 files changed, 7 insertions(+), 113 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index ed89388cf..79bbe0e84 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -9,6 +9,7 @@
 - Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog).
   Instead, use `PdfPage.render()` with a loop or process pool.
 - Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
+- Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark theme" for light PDFs, we suggest instead post-processing rendered images with selective lightness inversion.
 - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest.
 - Removed `fb_render` param from `PdfImage.extract()` because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
 - Removed some deprecated members/params (e.g. legacy version flags, `recopy` of `PdfBitmap.from_pil()`)
diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 211cc5cb2..7c400b791 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -43,15 +43,6 @@ def _bitmap_wrapper_foreign_simple(width, height, format, *args, **kwargs):
     foreign_simple = _bitmap_wrapper_foreign_simple,
 )
 
-CsFields = ("path_fill", "path_stroke", "text_fill", "text_stroke")
-ColorOpts = dict(metavar="C", nargs=4, type=int)
-SampleTheme = dict(
-    # TODO improve colors - currently it's just some random ones to distinguish the different drawings
-    path_fill   = (170, 100, 0,   255),  # dark orange
-    path_stroke = (0,   150, 255, 255),  # sky blue
-    text_fill   = (255, 255, 255, 255),  # white
-    text_stroke = (150, 255, 0,   255),  # green
-)
 
 def attach(parser):
     add_input(parser, pages=True)
@@ -92,8 +83,8 @@ def attach(parser):
     )
     parser.add_argument(
         "--fill-color",
-        help = "Color the bitmap will be filled with before rendering. It shall be given in RGBA format as a sequence of integers ranging from 0 to 255. Defaults to white.",
-        **ColorOpts,
+        metavar="C", nargs=4, type=int,
+        help = "Color the bitmap will be filled with before rendering. Shall be given in RGBA format as a sequence of integers ranging from 0 to 255. Defaults to white.",
     )
     parser.add_argument(
         "--optimize-mode",
@@ -198,37 +189,6 @@ def attach(parser):
         type = str.lower,
         help = "The map function to use (backend specific, the default is an iterative map)."
     )
-    
-    color_scheme = parser.add_argument_group(
-        title = "Forced color scheme",
-        description = "Options for using pdfium's forced color scheme renderer. Deprecated, considered not useful.",
-    )
-    color_scheme.add_argument(
-        "--sample-theme",
-        action = "store_true",
-        help = "Use a dark background sample theme as base. Explicit color params override selectively."
-    )
-    color_scheme.add_argument(
-        "--path-fill",
-        **ColorOpts
-    )
-    color_scheme.add_argument(
-        "--path-stroke",
-        **ColorOpts
-    )
-    color_scheme.add_argument(
-        "--text-fill",
-        **ColorOpts
-    )
-    color_scheme.add_argument(
-        "--text-stroke",
-        **ColorOpts
-    )
-    color_scheme.add_argument(
-        "--fill-to-stroke",
-        action = "store_true",
-        help = "Only draw borders around fill areas using the `path_stroke` color, instead of filling with the `path_fill` color.",
-    )
 
 
 class SavingEngine:
@@ -296,7 +256,7 @@ def main(args):
     if args.prefix is None:
         args.prefix = f"{args.input.stem}_"
     if args.fill_color is None:
-        args.fill_color = (0, 0, 0, 255) if args.sample_theme else (255, 255, 255, 255)
+        args.fill_color = (255, 255, 255, 255)
     if args.linear is None:
         args.linear = 6 if args.format == "jpg" else 3
     
@@ -314,20 +274,12 @@ def main(args):
         # PIL can't save BGRX as PNG
         args.prefer_bgrx = args.engine_cls is PILEngine and args.format != "png"
     
-    cs_kwargs = dict()
-    if args.sample_theme:
-        cs_kwargs.update(**SampleTheme)
-    cs_kwargs.update(**{f: getattr(args, f) for f in CsFields if getattr(args, f)})
-    cs = pdfium.PdfColorScheme(**cs_kwargs) if len(cs_kwargs) > 0 else None
-    
     kwargs = dict(
         scale = args.scale,
         rotation = args.rotation,
         crop = args.crop,
         grayscale = args.grayscale,
         fill_color = args.fill_color,
-        color_scheme = cs,
-        fill_to_stroke = args.fill_to_stroke,
         optimize_mode = args.optimize_mode,
         draw_annots = args.draw_annots,
         may_draw_forms = args.draw_forms,
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index e0dd5d42c..fff394f4f 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = ("PdfPage", "PdfColorScheme")
+__all__ = ("PdfPage", )
 
 import math
 import ctypes
@@ -328,7 +328,6 @@ def render(
             crop = (0, 0, 0, 0),
             may_draw_forms = True,
             bitmap_maker = PdfBitmap.new_native,
-            color_scheme = None,
             fill_to_stroke = False,
             **kwargs
         ):
@@ -353,9 +352,6 @@ def render(
             bitmap_maker (typing.Callable):
                 Callback function used to create the :class:`.PdfBitmap`.
                 
-            color_scheme (PdfColorScheme | None):
-                An optional, custom rendering color scheme.
-                
             fill_to_stroke (bool):
                 If True and rendering with custom color scheme, fill paths will be stroked.
                 
@@ -418,25 +414,13 @@ def render(
             raise ValueError("Crop exceeds page dimensions")
         
         cl_format, rev_byteorder, fill_color, flags = _parse_renderopts(**kwargs)
-        if (color_scheme is not None) and fill_to_stroke:
-            flags |= pdfium_c.FPDF_CONVERT_FILL_TO_STROKE
         
         bitmap = bitmap_maker(width, height, format=cl_format, rev_byteorder=rev_byteorder)
         bitmap.fill_rect(0, 0, width, height, fill_color)
         
         render_args = (bitmap, self, -crop[0], -crop[3], src_width, src_height, pdfium_i.RotationToConst[rotation], flags)
         
-        if color_scheme is None:
-            pdfium_c.FPDF_RenderPageBitmap(*render_args)
-        else:
-            
-            pause = pdfium_c.IFSDK_PAUSE(version=1)
-            pdfium_i.set_callback(pause, "NeedToPauseNow", lambda _: False)
-            
-            fpdf_cs = color_scheme.convert(rev_byteorder)
-            status = pdfium_c.FPDF_RenderPageBitmapWithColorScheme_Start(*render_args, fpdf_cs, pause)
-            assert status == pdfium_c.FPDF_RENDER_DONE
-            pdfium_c.FPDF_RenderPage_Close(self)
+        pdfium_c.FPDF_RenderPageBitmap(*render_args)
         
         if may_draw_forms and self.formenv:
             pdfium_c.FPDF_FFLDraw(self.formenv, *render_args)
@@ -510,26 +494,3 @@ def _parse_renderopts(
     
     # TODO consider using a namedtuple or something
     return cl_format, rev_byteorder, fill_color, flags
-
-
-class PdfColorScheme:
-    """
-    Rendering color scheme.
-    Each color shall be provided as a list of values for red, green, blue and alpha, ranging from 0 to 255.
-    """
-    
-    def __init__(self, path_fill, path_stroke, text_fill, text_stroke):
-        self.colors = dict(
-            path_fill_color=path_fill, path_stroke_color=path_stroke,
-            text_fill_color=text_fill, text_stroke_color=text_stroke,
-        )
-    
-    def convert(self, rev_byteorder):
-        """
-        Returns:
-            The color scheme as :class:`FPDF_COLORSCHEME` object.
-        """
-        fpdf_cs = pdfium_c.FPDF_COLORSCHEME()
-        for key, value in self.colors.items():
-            setattr(fpdf_cs, key, pdfium_i.color_tohex(value, rev_byteorder))
-        return fpdf_cs
diff --git a/tests_old/test_renderer.py b/tests_old/test_renderer.py
index 288b1e6fc..b5c79c347 100644
--- a/tests_old/test_renderer.py
+++ b/tests_old/test_renderer.py
@@ -171,27 +171,7 @@ def test_render_page_fill_color(fill_color, sample_page):
     assert bg_pixel == fill_color
 
 
-def test_render_page_colorscheme():
-    pdf = pdfium.PdfDocument(TestFiles.text)
-    page = pdf[0]
-    color_scheme = pdfium.PdfColorScheme(
-        path_fill   = (15,  15,  15,  255),
-        path_stroke = (255, 255, 255, 255),
-        text_fill   = (255, 255, 255, 255),
-        text_stroke = (255, 255, 255, 255),
-    )
-    image = page.render(
-        grayscale = True,
-        fill_color = (0, 0, 0, 255),
-        color_scheme = color_scheme,
-    ).to_pil()
-    assert image.mode == "L"
-    image.save(OutputDir / "render_colorscheme.png")
-
-
-@pytest.mark.parametrize(
-    "rev_byteorder", [False, True]
-)
+@pytest.mark.parametrize("rev_byteorder", [False, True])
 def test_render_page_tonumpy(rev_byteorder, sample_page):
     
     bitmap = sample_page.render(

From b563200f7148cb33cccd1e0e8c666d36df916cf6 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 21:19:46 +0200
Subject: [PATCH 020/140] Backport get_quad_points() from devel_old

+ tests
---
 src/pypdfium2/_helpers/pageobjects.py | 24 +++++++++++++++
 tests_old/test_pageobject.py          | 43 ++++++++++++++++++++-------
 2 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index e83bbf5c7..6a7fb5bec 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -93,6 +93,30 @@ def get_pos(self):
         return (l.value, b.value, r.value, t.value)
     
     
+    def get_quad_points(self):
+        """
+        Get the object's quadrilateral points (i.e. the positions of its corners).
+        For transformed objects, this provides tighter bounds than a rectangle (e.g. rotation by a non-multiple of 90°, shear).
+        
+        Note:
+            This function only supports image and text objects.
+        
+        Returns:
+            tuple[tuple[float*2] * 4]: Corner positions as (x, y) tuples, counter-clockwise from origin, i.e. bottom-left, bottom-right, top-right, top-left, in PDF page coordinates.
+        """
+        
+        if self.type not in (pdfium_c.FPDF_PAGEOBJ_IMAGE, pdfium_c.FPDF_PAGEOBJ_TEXT):
+            # as of pdfium 5921
+            raise RuntimeError("Quad points only supported for image and text.")
+        
+        q = pdfium_c.FS_QUADPOINTSF()
+        ok = pdfium_c.FPDFPageObj_GetRotatedBounds(self, q)
+        if not ok:
+            raise PdfiumError("Failed to get quad points.")
+        
+        return (q.x1, q.y1), (q.x2, q.y2), (q.x3, q.y3), (q.x4, q.y4)
+    
+    
     def get_matrix(self):
         """
         Returns:
diff --git a/tests_old/test_pageobject.py b/tests_old/test_pageobject.py
index afa2a3396..ab0ed0a06 100644
--- a/tests_old/test_pageobject.py
+++ b/tests_old/test_pageobject.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 import io
+import re
 import pytest
 import PIL.Image
 import pypdfium2 as pdfium
@@ -9,6 +10,12 @@
 from .conftest import TestFiles, OutputDir
 
 
+def compare_n2(data, exp_data):
+    assert len(data) == len(exp_data)
+    for d, exp_d in zip(data, exp_data):
+        assert pytest.approx(d, abs=1) == exp_d
+
+
 def test_image_objects():
     pdf = pdfium.PdfDocument(TestFiles.images)
     page = pdf[0]
@@ -17,14 +24,14 @@ def test_image_objects():
     images = list( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
     assert len(images) == 3
     
-    obj = images[0]
-    assert isinstance(obj, pdfium.PdfObject)
-    assert type(obj) is pdfium.PdfImage
-    assert obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE
-    assert isinstance(obj.raw, pdfium_c.FPDF_PAGEOBJECT)
-    assert obj.level == 0
-    assert obj.page is page
-    assert obj.pdf is pdf
+    img_0 = images[0]
+    assert isinstance(img_0, pdfium.PdfObject)
+    assert type(img_0) is pdfium.PdfImage
+    assert img_0.type == pdfium_c.FPDF_PAGEOBJ_IMAGE
+    assert isinstance(img_0.raw, pdfium_c.FPDF_PAGEOBJECT)
+    assert img_0.level == 0
+    assert img_0.page is page
+    assert img_0.pdf is pdf
     
     positions = [img.get_pos() for img in images]
     exp_positions = [
@@ -32,9 +39,12 @@ def test_image_objects():
         (48, 652, 163, 700),
         (204, 204, 577, 360),
     ]
-    assert len(positions) == len(exp_positions)
-    for pos, exp_pos in zip(positions, exp_positions):
-        assert pytest.approx(pos, abs=1) == exp_pos
+    compare_n2(positions, exp_positions)
+    
+    compare_n2(
+        img_0.get_quad_points(),
+        ((132.7, 459.2), (349.5, 459.2), (349.5, 549.7), (132.7, 549.7))
+    )
 
 
 def test_misc_objects():
@@ -52,6 +62,17 @@ def test_misc_objects():
         assert obj.pdf is pdf
         pos = obj.get_pos()
         assert len(pos) == 4
+    
+    text_obj = next(obj for obj in page.get_objects() if obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT)
+    path_obj = next(obj for obj in page.get_objects() if obj.type == pdfium_c.FPDF_PAGEOBJ_PATH)
+    
+    compare_n2(
+        text_obj.get_quad_points(),
+        ((57.3, 767.4), (124.2, 767.4), (124.2, 780.9), (57.3, 780.9))
+    )
+    
+    with pytest.raises(RuntimeError, match=re.escape("Quad points only supported for image and text.")):
+        path_obj.get_quad_points()
 
 
 def test_new_image_from_jpeg():

From 30996b39004ea6dd82af675b28adb3f2b80464d3 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 21:33:55 +0200
Subject: [PATCH 021/140] Apply renamings, update pageobjects CLI

get_size() -> get_px_size()
get_pos() -> get_bounds()
---
 README.md                                 |  4 +-
 src/pypdfium2/_cli/imgtopdf.py            |  2 +-
 src/pypdfium2/_cli/pageobjects.py         | 69 +++++++++++++----------
 src/pypdfium2/_helpers/pageobjects.py     | 10 ++--
 tests/expectations/pageobjects_images.txt | 22 +++++---
 tests/test_nup.py                         |  2 +-
 tests_old/test_pageobject.py              | 10 ++--
 7 files changed, 65 insertions(+), 54 deletions(-)

diff --git a/README.md b/README.md
index d611f769f..85300ad2a 100644
--- a/README.md
+++ b/README.md
@@ -254,7 +254,7 @@ Here are some examples of using the support model API.
   
   # Locate objects on the page
   for obj in page.get_objects():
-      print(obj.level, obj.type, obj.get_pos())
+      print(obj.level, obj.type, obj.get_bounds())
   ```
 
 * Extract and search text
@@ -300,7 +300,7 @@ Here are some examples of using the support model API.
   
   image = pdfium.PdfImage.new(pdf)
   image.load_jpeg("./tests/resources/mona_lisa.jpg")
-  width, height = image.get_size()
+  width, height = image.get_px_size()
   
   matrix = pdfium.PdfMatrix().scale(width, height)
   image.set_matrix(matrix)
diff --git a/src/pypdfium2/_cli/imgtopdf.py b/src/pypdfium2/_cli/imgtopdf.py
index 7fcf83cd5..542d637e2 100644
--- a/src/pypdfium2/_cli/imgtopdf.py
+++ b/src/pypdfium2/_cli/imgtopdf.py
@@ -56,7 +56,7 @@ def main(args):
             image_obj.set_bitmap(bitmap)
             bitmap.close()
         
-        w, h = image_obj.get_size()
+        w, h = image_obj.get_px_size()
         image_obj.set_matrix( pdfium.PdfMatrix().scale(w, h) )
         page = pdf.new_page(w, h)
         page.insert_obj(image_obj)
diff --git a/src/pypdfium2/_cli/pageobjects.py b/src/pypdfium2/_cli/pageobjects.py
index 9e9aee531..dcfccb011 100644
--- a/src/pypdfium2/_cli/pageobjects.py
+++ b/src/pypdfium2/_cli/pageobjects.py
@@ -1,13 +1,13 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
+# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 # TODO test-confirm filter and info params
 
-from enum import Enum
+from collections import OrderedDict
 import pypdfium2._helpers as pdfium
-import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
-# TODO? consider dotted access
+import pypdfium2.raw as pdfium_r
+# CONSIDER dotted access
 from pypdfium2._cli._parsers import (
     add_input,
     add_n_digits,
@@ -16,9 +16,9 @@
 )
 
 
-class InfoParams (Enum):
-    pos = 0
-    imageinfo = 1
+PARAM_POS = "pos"
+PARAM_IMGINFO = "imginfo"
+INFO_PARAMS = (PARAM_POS, PARAM_IMGINFO)
 
 
 def attach(parser):
@@ -44,20 +44,28 @@ def attach(parser):
     parser.add_argument(
         "--info",
         nargs = "*",
-        type = lambda s: InfoParams[s.lower()],
-        default = (InfoParams.pos, InfoParams.imageinfo),
-        help = "Object details to show (pos, imageinfo).",
+        type = str.lower,
+        choices = INFO_PARAMS,
+        default = INFO_PARAMS,
+        help = "Object details to show.",
     )
 
 
-def print_img_metadata(metadata, pad=""):
-    for attr in pdfium_c.FPDF_IMAGEOBJ_METADATA.__slots__:
-        value = getattr(metadata, attr)
-        if attr == "colorspace":
-            value = pdfium_i.ColorspaceToStr.get(value)
-        elif attr == "marked_content_id" and value == -1:
-            continue
-        print(pad + f"{attr}: {value}\n", end="")
+def print_img_metadata(m, n_digits, pad=""):
+    
+    members = OrderedDict(
+        width = m.width,
+        height = m.height,
+        horizontal_dpi = round(m.horizontal_dpi, n_digits),
+        vertical_dpi = round(m.vertical_dpi, n_digits),
+        bits_per_pixel = m.bits_per_pixel,
+        colorspace = pdfium_i.ColorspaceToStr.get(m.colorspace),
+    )
+    if m.marked_content_id != -1:
+        members["marked_content_id"] = m.marked_content_id
+    
+    for key, value in members.items():
+        print(pad + f"{key}: {value}")
 
 
 def main(args):
@@ -68,38 +76,37 @@ def main(args):
     if args.filter:
         args.filter = [pdfium_i.ObjectTypeToConst[t] for t in args.filter]
     
-    show_pos = (InfoParams.pos in args.info)
-    show_imageinfo = (InfoParams.imageinfo in args.info)
+    show_pos = (PARAM_POS in args.info)
+    show_imageinfo = (PARAM_IMGINFO in args.info)
     total_count = 0
     
     for i in args.pages:
         
         page = pdf[i]
-        obj_searcher = page.get_objects(
-            filter = args.filter,
-            max_depth = args.max_depth,
-        )
-        preamble = f"# Page {i+1}\n"
+        obj_searcher = page.get_objects(args.filter, max_depth=args.max_depth)
+        print(f"# Page {i+1}")
         count = 0
         
         for obj in obj_searcher:
             
             pad_0 = "    " * obj.level
             pad_1 = pad_0 + "    "
-            print(preamble + pad_0 + pdfium_i.ObjectTypeToStr.get(obj.type))
+            print(pad_0 + pdfium_i.ObjectTypeToStr.get(obj.type))
             
             if show_pos:
-                pos = round_list(obj.get_pos(), args.n_digits)
-                print(pad_1 + f"Position: {pos}")
+                bounds = round_list(obj.get_bounds(), args.n_digits)
+                print(pad_1 + f"Bounding Box: {bounds}")
+                if obj.type in (pdfium_r.FPDF_PAGEOBJ_IMAGE, pdfium_r.FPDF_PAGEOBJ_TEXT):
+                    quad_bounds = obj.get_quad_points()
+                    print(pad_1 + f"Quad Points: {[round_list(p, args.n_digits) for p in quad_bounds]}")
             
-            # TODO? also call get_size() for coverage
             if show_imageinfo and isinstance(obj, pdfium.PdfImage):
                 print(pad_1 + f"Filters: {obj.get_filters()}")
                 metadata = obj.get_metadata()
-                print_img_metadata(metadata, pad=pad_1)
+                assert (metadata.width, metadata.height) == obj.get_px_size()
+                print_img_metadata(metadata, args.n_digits, pad=pad_1)
             
             count += 1
-            preamble = ""
         
         if count > 0:
             print(f"-> Count: {count}\n")
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 6a7fb5bec..fb846d318 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -75,7 +75,7 @@ def parent(self):  # AutoCloseable hook
         return self.pdf if self.page is None else self.page
     
     
-    def get_pos(self):
+    def get_bounds(self):
         """
         Get the position of the object on the page.
         
@@ -83,7 +83,7 @@ def get_pos(self):
             A tuple of four :class:`float` coordinates for left, bottom, right, and top.
         """
         if self.page is None:
-            raise RuntimeError("Must not call get_pos() on a loose pageobject.")
+            raise RuntimeError("Must not call get_bounds() on a loose pageobject.")
         
         l, b, r, t = c_float(), c_float(), c_float(), c_float()
         ok = pdfium_c.FPDFPageObj_GetBounds(self, l, b, r, t)
@@ -107,7 +107,7 @@ def get_quad_points(self):
         
         if self.type not in (pdfium_c.FPDF_PAGEOBJ_IMAGE, pdfium_c.FPDF_PAGEOBJ_TEXT):
             # as of pdfium 5921
-            raise RuntimeError("Quad points only supported for image and text.")
+            raise RuntimeError("Quad points only supported for image and text objects.")
         
         q = pdfium_c.FS_QUADPOINTSF()
         ok = pdfium_c.FPDFPageObj_GetRotatedBounds(self, q)
@@ -179,7 +179,7 @@ def get_metadata(self):
         
         Note:
             * The DPI values signify the resolution of the image on the PDF page, not the DPI metadata embedded in the image file.
-            * Due to issues in pdfium, this function might be slow on some kinds of images. If you only need size, prefer :meth:`.get_size` instead.
+            * Due to issues in pdfium, this function might be slow on some kinds of images. If you only need size, prefer :meth:`.get_px_size` instead.
         
         Returns:
             FPDF_IMAGEOBJ_METADATA: Image metadata structure
@@ -192,7 +192,7 @@ def get_metadata(self):
         return metadata
     
     
-    def get_size(self):
+    def get_px_size(self):
         """
         Returns:
             (int, int): Image dimensions as a tuple of (width, height).
diff --git a/tests/expectations/pageobjects_images.txt b/tests/expectations/pageobjects_images.txt
index 7888c0542..e67029267 100644
--- a/tests/expectations/pageobjects_images.txt
+++ b/tests/expectations/pageobjects_images.txt
@@ -1,33 +1,37 @@
 # Page 1
 text
-    Position: (58.692, 759.975, 127.024, 779.335)
+    Bounding Box: (58.692, 759.975, 127.024, 779.335)
+    Quad Points: [(58.692, 759.975), (127.024, 759.975), (127.024, 779.335), (58.692, 779.335)]
 image
-    Position: (132.7, 459.189, 349.5, 549.689)
+    Bounding Box: (132.7, 459.189, 349.5, 549.689)
+    Quad Points: [(132.7, 459.189), (349.5, 459.189), (349.5, 549.689), (132.7, 549.689)]
     Filters: ['CCITTFaxDecode']
     width: 115
     height: 48
-    horizontal_dpi: 38.19187927246094
-    vertical_dpi: 38.18785858154297
+    horizontal_dpi: 38.1919
+    vertical_dpi: 38.1879
     bits_per_pixel: 1
     colorspace: DeviceGray
     marked_content_id: 1
 image
-    Position: (47.65, 652.239, 162.6, 700.239)
+    Bounding Box: (47.65, 652.239, 162.6, 700.239)
+    Quad Points: [(47.65, 652.239), (162.6, 652.239), (162.6, 700.239), (47.65, 700.239)]
     Filters: ['CCITTFaxDecode']
     width: 115
     height: 48
-    horizontal_dpi: 72.03131103515625
+    horizontal_dpi: 72.0313
     vertical_dpi: 72.0
     bits_per_pixel: 1
     colorspace: DeviceGray
     marked_content_id: 2
 image
-    Position: (203.55, 204.089, 577.2, 360.039)
+    Bounding Box: (203.55, 204.089, 577.2, 360.039)
+    Quad Points: [(203.55, 204.089), (577.2, 204.089), (577.2, 360.039), (203.55, 360.039)]
     Filters: ['CCITTFaxDecode']
     width: 115
     height: 48
-    horizontal_dpi: 22.159772872924805
-    vertical_dpi: 22.16094970703125
+    horizontal_dpi: 22.1598
+    vertical_dpi: 22.1609
     bits_per_pixel: 1
     colorspace: DeviceGray
     marked_content_id: 3
diff --git a/tests/test_nup.py b/tests/test_nup.py
index 0d505cc8e..1d1f43bcb 100644
--- a/tests/test_nup.py
+++ b/tests/test_nup.py
@@ -32,7 +32,7 @@ def test_xobject_placement():
     dest_page_1.insert_obj(po)
     assert po.pdf is dest_pdf
     assert po.page is dest_page_1
-    pos_a = po.get_pos()
+    pos_a = po.get_bounds()
     # xfail with pdfium < 5370, https://crbug.com/pdfium/1905
     assert pytest.approx(pos_a, abs=0.5) == (19, 440, 279, 823)
     
diff --git a/tests_old/test_pageobject.py b/tests_old/test_pageobject.py
index ab0ed0a06..b7e8451f3 100644
--- a/tests_old/test_pageobject.py
+++ b/tests_old/test_pageobject.py
@@ -33,7 +33,7 @@ def test_image_objects():
     assert img_0.page is page
     assert img_0.pdf is pdf
     
-    positions = [img.get_pos() for img in images]
+    positions = [img.get_bounds() for img in images]
     exp_positions = [
         (133, 459, 350, 550),
         (48, 652, 163, 700),
@@ -60,7 +60,7 @@ def test_misc_objects():
         assert obj.level == 0
         assert obj.page is page
         assert obj.pdf is pdf
-        pos = obj.get_pos()
+        pos = obj.get_bounds()
         assert len(pos) == 4
     
     text_obj = next(obj for obj in page.get_objects() if obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT)
@@ -71,7 +71,7 @@ def test_misc_objects():
         ((57.3, 767.4), (124.2, 767.4), (124.2, 780.9), (57.3, 780.9))
     )
     
-    with pytest.raises(RuntimeError, match=re.escape("Quad points only supported for image and text.")):
+    with pytest.raises(RuntimeError, match=re.escape("Quad points only supported for image and text objects.")):
         path_obj.get_quad_points()
 
 
@@ -83,7 +83,7 @@ def test_new_image_from_jpeg():
     image_a = pdfium.PdfImage.new(pdf)
     buffer = open(TestFiles.mona_lisa, "rb")
     image_a.load_jpeg(buffer, autoclose=True)
-    width, height = image_a.get_size()
+    width, height = image_a.get_px_size()
     page.insert_obj(image_a)
     
     assert len(pdf._data_holder) == 1
@@ -177,7 +177,7 @@ def test_replace_image_with_jpeg():
     image_1 = images[0]
     
     image_1.load_jpeg(TestFiles.mona_lisa, pages=[page])
-    width, height = image_1.get_size()
+    width, height = image_1.get_px_size()
     assert matrices == [img.get_matrix() for img in images]
     
     # preserve the aspect ratio

From 02e23cfd333de324572b3b514d930c572e005977 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 21:43:40 +0200
Subject: [PATCH 022/140] Update changelog

---
 docs/devel/changelog_staging.md | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 79bbe0e84..352bf87f1 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -6,16 +6,22 @@
 # Changelog for next release
 
 *API-breaking changes*
-- Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog).
-  Instead, use `PdfPage.render()` with a loop or process pool.
-- Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
-- Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark them" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion.
+- Rendering / Bitmap
+  * Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). Instead, use `PdfPage.render()` with a loop or process pool.
+  * Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
+  * `PdfBitmap.from_pil()`: Removed `recopy` param.
+  * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark theme" for light PDFs, we suggest post-processing rendered images with selective lightness inversion instead.
+- Pageobjects
+  * Renamed `PdfObject.get_pos()` to `.get_bounds()`.
+  * Renamed `PdfImage.get_size()` to `.get_px_size()`.
+  * `PdfImage.extract()`: Removed `fb_render` param because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
 - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest.
-- Removed `fb_render` param from `PdfImage.extract()` because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
-- Removed some deprecated members/params (e.g. legacy version flags, `recopy` of `PdfBitmap.from_pil()`)
+- Removed legacy version flags.
 
 *Improvements and new features*
+- Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
+- Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
 - Simplified version implementation (no API change expected). All attributes are now assigned and show up in `dir(...)`, instead of `__getattr__` magic.
 
 <!-- TODO

From 55e969c074583786907a9ae9dc93f58e6fd199af Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 5 Apr 2024 21:51:32 +0200
Subject: [PATCH 023/140] Update readme examples

---
 README.md                             | 35 ++++++++++++++++++---------
 docs/devel/changelog_staging.md       |  2 +-
 src/pypdfium2/_cli/pageobjects.py     |  7 +++---
 src/pypdfium2/_helpers/pageobjects.py |  4 +--
 4 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 85300ad2a..caeeecfc2 100644
--- a/README.md
+++ b/README.md
@@ -263,28 +263,39 @@ Here are some examples of using the support model API.
   textpage = page.get_textpage()
   
   # Extract text from the whole page
-  text_all = textpage.get_text_range()
+  text_all = textpage.get_text_bounded()
   # Extract text from a specific rectangular area
-  text_part = textpage.get_text_bounded(left=50, bottom=100, right=width-50, top=height-100)
+  text_rect = textpage.get_text_bounded(left=50, bottom=100, right=width-50, top=height-100)
+  # Extract text from a specific char range
+  text_span = textpage.get_text_range(index=10, count=15)
   
   # Locate text on the page
   searcher = textpage.search("something", match_case=False, match_whole_word=False)
   # This returns the next occurrence as (char_index, char_count), or None if not found
-  first_occurrence = searcher.get_next()
+  match = searcher.get_next()
   ```
 
-<!-- TOC API will change with the next major release -->
 * Read the table of contents
   ```python
-  for item in pdf.get_toc():
-      state = "*" if item.n_kids == 0 else "-" if item.is_closed else "+"
-      target = "?" if item.page_index is None else item.page_index+1
-      print(
-          "    " * item.level +
-          "[%s] %s -> %s  # %s %s" % (
-              state, item.title, target, item.view_mode, item.view_pos,
-          )
+  import pypdfium2.internal as pdfium_i
+  
+  for bm in pdf.get_toc(max_depth=15):
+      count, dest = bm.get_count(), bm.get_dest()
+      out = "    " * bm.level
+      out += "[%s] %s -> " % (
+          f"{count:+}" if count != 0 else "*",
+          bm.get_title(),
       )
+      if dest:
+          index, (view_mode, view_pos) = dest.get_index(), dest.get_view()
+          out += "%s  # %s %s" % (
+              index+1 if index != None else "?",
+              pdfium_i.ViewmodeToStr.get(view_mode),
+              [round(v, 3) for v in view_pos],
+          )
+      else:
+          out += "_"
+      print(out)
   ```
 
 * Create a new PDF with an empty A4 sized page
diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 352bf87f1..152aa8582 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -22,7 +22,7 @@
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
-- Simplified version implementation (no API change expected). All attributes are now assigned and show up in `dir(...)`, instead of `__getattr__` magic.
+- Simplified version implementation (no API change expected). Replaced `__getattr__` magic with assignments, so all attributes now show up in `dir()`.
 
 <!-- TODO
 See https://github.com/pypdfium2-team/pypdfium2/blob/devel_old/docs/devel/changelog_staging.md
diff --git a/src/pypdfium2/_cli/pageobjects.py b/src/pypdfium2/_cli/pageobjects.py
index dcfccb011..176f5c44d 100644
--- a/src/pypdfium2/_cli/pageobjects.py
+++ b/src/pypdfium2/_cli/pageobjects.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
+# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 # TODO test-confirm filter and info params
@@ -6,8 +6,7 @@
 from collections import OrderedDict
 import pypdfium2._helpers as pdfium
 import pypdfium2.internal as pdfium_i
-import pypdfium2.raw as pdfium_r
-# CONSIDER dotted access
+import pypdfium2.raw as pdfium_c
 from pypdfium2._cli._parsers import (
     add_input,
     add_n_digits,
@@ -96,7 +95,7 @@ def main(args):
             if show_pos:
                 bounds = round_list(obj.get_bounds(), args.n_digits)
                 print(pad_1 + f"Bounding Box: {bounds}")
-                if obj.type in (pdfium_r.FPDF_PAGEOBJ_IMAGE, pdfium_r.FPDF_PAGEOBJ_TEXT):
+                if obj.type in (pdfium_c.FPDF_PAGEOBJ_IMAGE, pdfium_c.FPDF_PAGEOBJ_TEXT):
                     quad_bounds = obj.get_quad_points()
                     print(pad_1 + f"Quad Points: {[round_list(p, args.n_digits) for p in quad_bounds]}")
             
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index fb846d318..6005e55c9 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -77,10 +77,10 @@ def parent(self):  # AutoCloseable hook
     
     def get_bounds(self):
         """
-        Get the position of the object on the page.
+        Get the bounds of the object on the page.
         
         Returns:
-            A tuple of four :class:`float` coordinates for left, bottom, right, and top.
+            tuple[float * 4]: Left, bottom, right and top, in PDF page coordinates.
         """
         if self.page is None:
             raise RuntimeError("Must not call get_bounds() on a loose pageobject.")

From c0fdc77ac8ff0e5ce2dd9d49391111ef64e660d1 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 6 Apr 2024 21:01:48 +0200
Subject: [PATCH 024/140] doc/comments

---
 docs/source/python_api.rst         | 3 ---
 src/pypdfium2/_helpers/document.py | 8 +++-----
 src/pypdfium2/_helpers/matrix.py   | 4 ++--
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/docs/source/python_api.rst b/docs/source/python_api.rst
index 4a63522c7..9e2c12399 100644
--- a/docs/source/python_api.rst
+++ b/docs/source/python_api.rst
@@ -76,9 +76,6 @@ Version
 
 .. automodule:: pypdfium2.version
 
-.. deprecated:: 4.22
-   The legacy members ``V_PYPDFIUM2, V_LIBPDFIUM, V_BUILDNAME, V_PDFIUM_IS_V8, V_LIBPDFIUM_FULL`` will be removed in version 5.
-
 Document
 ********
 .. automodule:: pypdfium2._helpers.document
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 75ef2ef90..80ac52517 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -510,7 +510,7 @@ def get_toc(
         
         bm_ptr = pdfium_c.FPDFBookmark_GetFirstChild(self, parent)
         
-        # NOTE We need bool(ptr) here to handle cases where .contents is a null pointer (raises exception on access). Don't use ptr != None, it's always true.
+        # NOTE We need bool(ptr) here to handle null pointers (where accessing .contents would raise an exception). Don't use ptr != None, it's always true.
         while bm_ptr:
             
             address = ctypes.addressof(bm_ptr.contents)
@@ -586,10 +586,8 @@ def as_pageobject(self):
             Pageobjects created from an XObject remain valid after the XObject is closed.
         """
         raw_pageobj = pdfium_c.FPDF_NewFormObjectFromXObject(self)
-        return PdfObject(  # not a child object (see above)
-            raw = raw_pageobj,
-            pdf = self.pdf,
-        )
+        # not a child object (see above)
+        return PdfObject(raw=raw_pageobj, pdf=self.pdf)
 
 
 def _open_pdf(input_data, password, autoclose):
diff --git a/src/pypdfium2/_helpers/matrix.py b/src/pypdfium2/_helpers/matrix.py
index a8cab515d..523f74fa2 100644
--- a/src/pypdfium2/_helpers/matrix.py
+++ b/src/pypdfium2/_helpers/matrix.py
@@ -21,7 +21,7 @@ class PdfMatrix:
     Note:
         * The PDF format uses row vectors.
         * Transformations operate from the origin of the coordinate system
-          (PDF coordinates: bottom left corner, Device coordinates: top left corner).
+          (PDF coordinates: commonly bottom left, but can be any corner in principle. Device coordinates: top left).
         * Matrix calculations are implemented independently in Python.
         * Matrix objects are immutable, so transforming methods return a new matrix.
         * Matrix objects implement ctypes auto-conversion to ``FS_MATRIX`` for easy use as C function parameter.
@@ -130,7 +130,7 @@ def mirror(self, v, h):
         """
         Parameters:
             v (bool): Whether to mirror vertically (at the Y axis).
-            h (bool): Whether to mirror horizontall (at the X axis).
+            h (bool): Whether to mirror horizontally (at the X axis).
         """
         return self.scale(x=(-1 if v else 1), y=(-1 if h else 1))
     

From c9b8f1527c0cfb8ef379498e17192928988fa865 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 8 Apr 2024 14:25:12 +0200
Subject: [PATCH 025/140] style nits

---
 src/pypdfium2/_helpers/document.py    |  2 +-
 src/pypdfium2/_helpers/matrix.py      |  2 +-
 src/pypdfium2/_helpers/pageobjects.py | 15 +++++++++------
 src/pypdfium2/internal/bases.py       |  2 +-
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 80ac52517..b9c6ccdc3 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -696,4 +696,4 @@ def get_view(self):
         pos = (pdfium_c.FS_FLOAT * 4)()
         mode = pdfium_c.FPDFDest_GetView(self, n_params, pos)
         pos = list(pos)[:n_params.value]
-        return (mode, pos)
+        return mode, pos
diff --git a/src/pypdfium2/_helpers/matrix.py b/src/pypdfium2/_helpers/matrix.py
index 523f74fa2..de489857b 100644
--- a/src/pypdfium2/_helpers/matrix.py
+++ b/src/pypdfium2/_helpers/matrix.py
@@ -46,7 +46,7 @@ def __repr__(self):
     def __eq__(self, other):
         if type(self) is not type(other):
             return False
-        return (self.get() == other.get())
+        return self.get() == other.get()
     
     @property
     def _as_parameter_(self):
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 6005e55c9..47371257f 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -232,7 +232,10 @@ def load_jpeg(self, source, pages=None, inline=False, autoclose=True):
             raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte buffer.")
         
         bufaccess, to_hold = pdfium_i.get_bufreader(buffer)
-        loader = pdfium_c.FPDFImageObj_LoadJpegFileInline if inline else pdfium_c.FPDFImageObj_LoadJpegFile
+        loader = {
+            False: pdfium_c.FPDFImageObj_LoadJpegFile,
+            True: pdfium_c.FPDFImageObj_LoadJpegFileInline,
+        }[inline]
         
         c_pages, page_count = pdfium_i.pages_c_array(pages)
         ok = loader(c_pages, page_count, self, bufaccess)
@@ -300,7 +303,10 @@ def get_data(self, decode_simple=False):
         Returns:
             ctypes.Array: The data of the image stream (as :class:`~ctypes.c_ubyte` array).
         """
-        func = pdfium_c.FPDFImageObj_GetImageDataDecoded if decode_simple else pdfium_c.FPDFImageObj_GetImageDataRaw
+        func = {
+            False: pdfium_c.FPDFImageObj_GetImageDataRaw,
+            True: pdfium_c.FPDFImageObj_GetImageDataDecoded,
+        }[decode_simple]
         n_bytes = func(self, None, 0)
         buffer = (ctypes.c_ubyte * n_bytes)()
         func(self, buffer, n_bytes)
@@ -375,10 +381,7 @@ class ImageNotExtractableError (Exception):
 def _get_pil_mode(colorspace, bpp):
     # In theory, indexed (palettized) and ICC-based color spaces could be handled as well, but PDFium currently does not provide access to the palette or the ICC profile
     if colorspace == pdfium_c.FPDF_COLORSPACE_DEVICEGRAY:
-        if bpp == 1:
-            return "1"
-        else:
-            return "L"
+        return "1" if bpp == 1 else "L"
     elif colorspace == pdfium_c.FPDF_COLORSPACE_DEVICERGB:
         return "RGB"
     elif colorspace == pdfium_c.FPDF_COLORSPACE_DEVICECMYK:
diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py
index 010e11112..be64b9cfe 100644
--- a/src/pypdfium2/internal/bases.py
+++ b/src/pypdfium2/internal/bases.py
@@ -80,7 +80,7 @@ def _detach_finalizer(self):
     def _tree_closed(self):
         if self.raw is None:
             return True
-        if (self.parent is not None) and self.parent._tree_closed():
+        if self.parent != None and self.parent._tree_closed():
             return True
         return False
     

From e81c8d1c9d6824b0e04d4422830f88dc48b09800 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 8 Apr 2024 20:25:45 +0200
Subject: [PATCH 026/140] Take over PdfPosConv, with design explanation
 (untested)

---
 src/pypdfium2/_helpers/bitmap.py      |  5 +++
 src/pypdfium2/_helpers/page.py        | 56 ++++++++++++++++++++++++++-
 src/pypdfium2/_helpers/pageobjects.py |  3 +-
 src/pypdfium2/internal/bases.py       |  3 ++
 4 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index b1de99244..544ebff70 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -55,6 +55,7 @@ class PdfBitmap (pdfium_i.AutoCloseable):
     """
     
     def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, needs_free):
+        
         self.raw = raw
         self.buffer = buffer
         self.width = width
@@ -67,6 +68,10 @@ def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, ne
             False: pdfium_i.BitmapTypeToStr,
             True: pdfium_i.BitmapTypeToStrReverse,
         }[self.rev_byteorder][self.format]
+        
+        # slot to store arguments for PdfPosConv, set on page rendering
+        self._pos_args = None
+        
         super().__init__(pdfium_c.FPDFBitmap_Destroy, needs_free=needs_free, obj=self.buffer)
     
     
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index fff394f4f..1dc764622 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -6,6 +6,7 @@
 import math
 import ctypes
 import logging
+import weakref
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
 from pypdfium2._helpers.misc import PdfiumError
@@ -418,13 +419,15 @@ def render(
         bitmap = bitmap_maker(width, height, format=cl_format, rev_byteorder=rev_byteorder)
         bitmap.fill_rect(0, 0, width, height, fill_color)
         
-        render_args = (bitmap, self, -crop[0], -crop[3], src_width, src_height, pdfium_i.RotationToConst[rotation], flags)
+        pos_args = (-crop[0], -crop[3], src_width, src_height, pdfium_i.RotationToConst[rotation])
+        render_args = (bitmap, self, *pos_args, flags)
         
         pdfium_c.FPDF_RenderPageBitmap(*render_args)
-        
         if may_draw_forms and self.formenv:
             pdfium_c.FPDF_FFLDraw(self.formenv, *render_args)
         
+        bitmap._pos_args = (weakref.ref(self), *pos_args)
+        
         return bitmap
 
 
@@ -494,3 +497,52 @@ def _parse_renderopts(
     
     # TODO consider using a namedtuple or something
     return cl_format, rev_byteorder, fill_color, flags
+
+
+class PdfPosConv:  # TODO add to test suite
+    """
+    Pdf coordinate translator.
+    
+    Parameters:
+        page (PdfPage):
+            Handle to the page.
+        bitmap (PdfBitmap):
+            Handle to the bitmap, which must be a rendering of *page*.
+    """
+    
+    # NOTE The reason for this API design is that neither page nor bitmap should hold a permanent reference to the other, so they can be freed independently via finalizer. Obviously, a weak reference alone is not sufficient, as its object can disappear. So we need an explicit takeover of the page, ensuring it is held in memory.
+    
+    def __init__(self, page, bitmap):
+        
+        if not bitmap._pos_args:
+            raise RuntimeError("This bitmap does not belong to a page.")
+        
+        assert page != None
+        page_ref = bitmap._pos_args[0]
+        if page_ref() is not page:  # resolve weakref and check identity
+            raise RuntimeError("This bitmap was not rendered from the given page.")
+        
+        self.page = page
+        self._args = bitmap._pos_args[1:]
+    
+    
+    def to_page(self, bitmap_x, bitmap_y):
+        """
+        Translate coordinates from bitmap to page.
+        """
+        page_x, page_y = ctypes.c_double(), ctypes.c_double()
+        ok = pdfium_c.FPDF_DeviceToPage(self.page, *self._args, bitmap_x, bitmap_y, page_x, page_y)
+        if not ok:
+            raise PdfiumError("Failed to translate to page coordinates.")
+        return (page_x.value, page_y.value)
+    
+    
+    def to_bitmap(self, page_x, page_y):
+        """
+        Translate coordinates from page to bitmap.
+        """
+        bitmap_x, bitmap_y = ctypes.c_int(), ctypes.c_int()
+        ok = pdfium_c.FPDF_PageToDevice(self.page, *self._args, page_x, page_y, bitmap_x, bitmap_y)
+        if not ok:
+            raise PdfiumError("Failed to translate to bitmap coordinates.")
+        return (bitmap_x.value, bitmap_y.value)
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 47371257f..b5a46e73c 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -393,9 +393,10 @@ def _get_pil_mode(colorspace, bpp):
 def _extract_smart(image_obj, fb_format=None):
     
     try:
+        # TODO can we change PdfImage.get_data() to take an mmap, so the data could be written directly into a file rather than an in-memory array?
         data, info = _extract_direct(image_obj)
     except ImageNotExtractableError:
-        # TODO? log reason why the image cannot be extracted directly
+        # TODO log reason why the image cannot be extracted directly?
         pil_image = image_obj.get_bitmap(render=False).to_pil()
     else:
         pil_image = None
diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py
index be64b9cfe..e1a70dcdb 100644
--- a/src/pypdfium2/internal/bases.py
+++ b/src/pypdfium2/internal/bases.py
@@ -26,6 +26,9 @@ class AutoCastable:
     
     @property
     def _as_parameter_(self):
+        # TODO tighten to `not isinstance(...)` (needs declaration of C type)
+        if not self.raw:
+            raise RuntimeError("Cannot use closed object as C function parameter.")
         return self.raw
 
 

From d501ac2fbf11bce61319aecd7611bc60fc8ec51c Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 8 Apr 2024 21:01:54 +0200
Subject: [PATCH 027/140] Start merging back tests and tests_old

This split, and merging the incomplete rewrite attempt, was one of the
worst mistakes I made in pypdfium2's history.
---
 .github/workflows/conda.yaml        |   2 +-
 .gitignore                          |   1 -
 run                                 |  10 +-
 tests/conftest.py                   |  39 ++-
 tests/test_attachments.py           |   6 +-
 tests/test_bitmap.py                |   2 -
 tests/test_cli.py                   |  26 +-
 tests/test_document.py              |  42 ++-
 tests/test_matrix.py                |   2 -
 tests/test_nup.py                   |   4 +-
 {tests_old => tests}/test_opener.py |   2 +
 tests/test_page.py                  |  62 +++++
 tests/test_pageobjects.py           | 264 +++++++++++++++++++
 tests/test_rendering.py             | 379 +++++++++++++++++++++++++++
 tests/test_saving.py                |  89 +++++--
 tests/test_textpage.py              | 139 +++++++++-
 tests/test_toc.py                   |   6 +-
 tests_old/__init__.py               |   2 -
 tests_old/conftest.py               |  67 -----
 tests_old/output/.gitkeep           |   1 -
 tests_old/resources                 |   1 -
 tests_old/test_page.py              |  64 -----
 tests_old/test_pageobject.py        | 266 -------------------
 tests_old/test_renderer.py          | 381 ----------------------------
 tests_old/test_saver.py             |  55 ----
 tests_old/test_text.py              | 146 -----------
 26 files changed, 995 insertions(+), 1063 deletions(-)
 delete mode 100644 tests/test_bitmap.py
 delete mode 100644 tests/test_matrix.py
 rename {tests_old => tests}/test_opener.py (99%)
 delete mode 100644 tests_old/__init__.py
 delete mode 100644 tests_old/conftest.py
 delete mode 100644 tests_old/output/.gitkeep
 delete mode 120000 tests_old/resources
 delete mode 100644 tests_old/test_page.py
 delete mode 100644 tests_old/test_pageobject.py
 delete mode 100644 tests_old/test_renderer.py
 delete mode 100644 tests_old/test_saver.py
 delete mode 100644 tests_old/test_text.py

diff --git a/.github/workflows/conda.yaml b/.github/workflows/conda.yaml
index b94afb711..1fc50115c 100644
--- a/.github/workflows/conda.yaml
+++ b/.github/workflows/conda.yaml
@@ -120,7 +120,7 @@ jobs:
         run: |
           conda install -y pytest pillow numpy
           conda install -y pypdfium2_${{ inputs.package }} --override-channels -c ./conda_dist/ -c pypdfium2-team -c bblanchon
-          pytest tests/ tests_old/
+          pytest tests/
   
   publish:
     
diff --git a/.gitignore b/.gitignore
index acc9141c6..d056b320f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,6 @@ build/
 dist/
 conda/*/out/
 tests/output/
-tests_old/output/
 
 data/
 !data/.gitkeep
diff --git a/run b/run
index 61eb2feac..e7c51bc7a 100755
--- a/run
+++ b/run
@@ -8,13 +8,13 @@
 args="${@:2}"
 
 function check() {
-    autoflake src/ setupsrc/ tests/ tests_old/ setup.py docs/source/conf.py --recursive --remove-all-unused-imports --ignore-pass-statements --ignore-init-module-imports
-    codespell --skip="./docs/build,./tests/resources,./tests/output,./tests_old/output,./data,./sourcebuild,./dist,./.git,__pycache__,.mypy_cache,.hypothesis" -L "tabe,splitted,fith,flate"
+    autoflake src/ setupsrc/ tests/ setup.py docs/source/conf.py --recursive --remove-all-unused-imports --ignore-pass-statements --ignore-init-module-imports
+    codespell --skip="./docs/build,./tests/resources,./tests/output,./data,./sourcebuild,./dist,./.git,__pycache__,.mypy_cache,.hypothesis" -L "tabe,splitted,fith,flate"
     reuse lint
 }
 
 function clean() {
-    rm -rf pypdfium2*.egg-info/ src/pypdfium2*.egg-info/ build/ dist/ data/* tests/output/* tests_old/output/* conda/bundle/out/ conda/helpers/out/ conda/raw/out/
+    rm -rf pypdfium2*.egg-info/ src/pypdfium2*.egg-info/ build/ dist/ data/* tests/output/* conda/bundle/out/ conda/helpers/out/ conda/raw/out/
 }
 
 function packaging_pypi() {
@@ -35,10 +35,10 @@ set -x
 case $1 in
 
 test)
-    python3 -m pytest tests/ tests_old/ $args;;
+    python3 -m pytest tests/ $args;;
 
 coverage)
-    python3 -m coverage run --omit "tests/*,tests_old/*,src/pypdfium2_raw/bindings.py,setupsrc/*" -m pytest tests/ tests_old/ $args
+    python3 -m coverage run --omit "tests/*,src/pypdfium2_raw/bindings.py,setupsrc/*" -m pytest tests/ $args
     python3 -m coverage report;;
 
 docs-build)
diff --git a/tests/conftest.py b/tests/conftest.py
index 5a311a54f..0c7559351 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,6 +6,8 @@
 from argparse import Namespace
 import pypdfium2.__main__ as pdfium_cli
 
+PyVersion = (sys.version_info.major, sys.version_info.minor)
+
 
 pdfium_cli.setup_logging()
 
@@ -28,5 +30,40 @@ def _gather_resources(dir, skip_exts=[".in"]):
     return test_files
 
 
-TestResources = _gather_resources(ResourceDir)
+TestFiles = _gather_resources(ResourceDir)
 TestExpectations = _gather_resources(ExpectationsDir)
+
+
+ExpRenderPixels = (
+    ( (0,   0  ), (255, 255, 255) ),
+    ( (150, 180), (129, 212, 26 ) ),
+    ( (150, 390), (42,  96,  153) ),
+    ( (150, 570), (128, 0,   128) ),
+)
+
+
+def get_members(cls):
+    members = []
+    for attr in dir(cls):
+        if attr.startswith("_"):
+            continue
+        members.append( getattr(cls, attr) )
+    return members
+
+
+# def iterate_testfiles(skip_encrypted=True):
+#     encrypted = (TestFiles.encrypted, )
+#     for attr_name in dir(TestFiles):
+#         if attr_name.startswith("_"):
+#             continue
+#         member = getattr(TestFiles, attr_name)
+#         if skip_encrypted and member in encrypted:
+#             continue
+#         yield member
+#
+#
+# def test_testpaths():
+#     for dirpath in (TestDir, ProjectDir, ResourceDir, OutputDir):
+#         assert dirpath.is_dir()
+#     for filepath in iterate_testfiles(False):
+#         assert filepath.is_file()
diff --git a/tests/test_attachments.py b/tests/test_attachments.py
index 2af663960..35c4ffae7 100644
--- a/tests/test_attachments.py
+++ b/tests/test_attachments.py
@@ -7,12 +7,12 @@
 import hashlib
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
-from .conftest import TestResources, OutputDir
+from .conftest import TestFiles, OutputDir
 
 
 def test_attachment():
     
-    pdf = pdfium.PdfDocument(TestResources.attachments)
+    pdf = pdfium.PdfDocument(TestFiles.attachments)
     assert pdf.count_attachments() == 2
     
     attachment_a = pdf.get_attachment(0)
@@ -69,7 +69,7 @@ def test_attachment():
     with pytest.raises(pdfium.PdfiumError, match=re.escape("Failed to extract attachment (buffer length 0).")):
         attachment_c.get_data()
     
-    data_c = TestResources.mona_lisa.read_bytes()
+    data_c = TestFiles.mona_lisa.read_bytes()
     attachment_c.set_data(data_c)
     assert attachment_c.get_data().raw == data_c
     
diff --git a/tests/test_bitmap.py b/tests/test_bitmap.py
deleted file mode 100644
index b4fb1b2f0..000000000
--- a/tests/test_bitmap.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
-# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 4c05bf5d1..16ece0fb7 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -10,7 +10,7 @@
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 import pypdfium2.__main__ as pdfium_cli
-from .conftest import TestResources, TestExpectations
+from .conftest import TestFiles, TestExpectations
 
 lib_logger = logging.getLogger("pypdfium2")
 
@@ -75,18 +75,18 @@ def _get_text(pdf, index):
 
 @pytest.mark.parametrize("resource", ["toc", "toc_viewmodes", "toc_circular", "toc_maxdepth"])
 def test_toc(resource):
-    run_cli(["toc", getattr(TestResources, resource)], getattr(TestExpectations, resource))
+    run_cli(["toc", getattr(TestFiles, resource)], getattr(TestExpectations, resource))
 
 
 def test_attachments(tmp_path):
     
-    run_cli(["attachments", TestResources.attachments, "list"], TestExpectations.attachments_list)
+    run_cli(["attachments", TestFiles.attachments, "list"], TestExpectations.attachments_list)
     
-    run_cli(["attachments", TestResources.attachments, "extract", "-o", tmp_path])
+    run_cli(["attachments", TestFiles.attachments, "extract", "-o", tmp_path])
     assert _get_files(tmp_path) == ["1_1.txt", "2_attached.pdf"]
     
     edited_pdf = tmp_path / "edited.pdf"
-    run_cli(["attachments", TestResources.attachments, "edit", "--del-numbers", "1,2", "--add-files", TestResources.mona_lisa, "-o", edited_pdf])
+    run_cli(["attachments", TestFiles.attachments, "edit", "--del-numbers", "1,2", "--add-files", TestFiles.mona_lisa, "-o", edited_pdf])
     run_cli(["attachments", edited_pdf, "list"], "[1] mona_lisa.jpg\n", capture=["out"])
 
 
@@ -96,33 +96,33 @@ def test_images(tmp_path):
     output_dir = tmp_path / "out"
     output_dir.mkdir()
     
-    run_cli(["imgtopdf", TestResources.mona_lisa, "-o", img_pdf])
+    run_cli(["imgtopdf", TestFiles.mona_lisa, "-o", img_pdf])
     run_cli(["extract-images", img_pdf, "-o", output_dir])
     
     output_name = "img_pdf_1_1.jpg"
     assert _get_files(output_dir) == [output_name]
-    assert filecmp.cmp(TestResources.mona_lisa, output_dir/output_name)
+    assert filecmp.cmp(TestFiles.mona_lisa, output_dir/output_name)
 
 
 @pytest.mark.parametrize("strategy", ["range", "bounded"])
 def test_extract_text(strategy):
-    run_cli(["extract-text", TestResources.text, "--strategy", strategy], TestExpectations.text_extract, normalize_lfs=True)
+    run_cli(["extract-text", TestFiles.text, "--strategy", strategy], TestExpectations.text_extract, normalize_lfs=True)
 
 
 @pytest.mark.parametrize("resource", ["multipage", "attachments", "forms"])
 def test_pdfinfo(resource):
-    run_cli(["pdfinfo", getattr(TestResources, resource)], getattr(TestExpectations, "pdfinfo_%s" % resource))
+    run_cli(["pdfinfo", getattr(TestFiles, resource)], getattr(TestExpectations, "pdfinfo_%s" % resource))
 
 
 @pytest.mark.parametrize("resource", ["images"])
 def test_pageobjects(resource):
-    run_cli(["pageobjects", getattr(TestResources, resource)], getattr(TestExpectations, "pageobjects_%s" % resource))
+    run_cli(["pageobjects", getattr(TestFiles, resource)], getattr(TestExpectations, "pageobjects_%s" % resource))
 
 
 def test_arrange(tmp_path):
     
     out = tmp_path / "out.pdf"
-    run_cli(["arrange", TestResources.multipage, TestResources.encrypted, TestResources.empty, "--pages", "1,3", "--passwords", "_", "test_user", "-o", out])
+    run_cli(["arrange", TestFiles.multipage, TestFiles.encrypted, TestFiles.empty, "--pages", "1,3", "--passwords", "_", "test_user", "-o", out])
     
     pdf = pdfium.PdfDocument(out)
     assert len(pdf) == 4
@@ -134,7 +134,7 @@ def test_arrange(tmp_path):
 def test_tile(tmp_path):
     
     out = tmp_path / "out.pdf"
-    run_cli(["tile", TestResources.multipage, "-r", 2, "-c", 2, "--width", 21.0, "--height", 29.7, "-u", "cm", "-o", out])
+    run_cli(["tile", TestFiles.multipage, "-r", 2, "-c", 2, "--width", 21.0, "--height", 29.7, "-u", "cm", "-o", out])
     
     pdf = pdfium.PdfDocument(out)
     assert len(pdf) == 1
@@ -149,7 +149,7 @@ def test_render_multipage(tmp_path):
     out_dir = tmp_path / "out"
     out_dir.mkdir()
     
-    run_cli(["render", TestResources.multipage, "-o", out_dir, "--scale", 0.2, "-f", "jpg"])
+    run_cli(["render", TestFiles.multipage, "-o", out_dir, "--scale", 0.2, "-f", "jpg"])
     
     out_files = list(out_dir.iterdir())
     assert sorted([f.name for f in out_files]) == ["multipage_1.jpg", "multipage_2.jpg", "multipage_3.jpg"]
diff --git a/tests/test_document.py b/tests/test_document.py
index 8eb2a3f55..d7bc54bf0 100644
--- a/tests/test_document.py
+++ b/tests/test_document.py
@@ -1,17 +1,19 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
+# TODO test formenv and page deletion
+
 import re
 import ctypes
 import pathlib
 import pytest
-from .conftest import TestResources
+from .conftest import TestFiles
 
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 
 
-parametrize_opener_files = pytest.mark.parametrize("file", [TestResources.empty])
+parametrize_opener_files = pytest.mark.parametrize("file", [TestFiles.empty])
 
 
 def _check_pdf(pdf):
@@ -84,7 +86,7 @@ def test_open_ctypes_array(file):
 
 def test_open_raw():
     # not meant for embedders, but works for testing all the same
-    pdf = pdfium.PdfDocument(TestResources.empty)
+    pdf = pdfium.PdfDocument(TestFiles.empty)
     pdf._finalizer.detach()
     input = pdf.raw
     assert isinstance(input, pdfium_c.FPDF_DOCUMENT)
@@ -115,7 +117,7 @@ def _make_encryption_cases(file, passwords):
 
 @pytest.mark.parametrize(
     ["input", "password"],
-    _make_encryption_cases(TestResources.encrypted, ["test_user", "test_owner"]),
+    _make_encryption_cases(TestFiles.encrypted, ["test_user", "test_owner"]),
 )
 def test_open_encrypted(input, password):
     pdf = pdfium.PdfDocument(input, password, autoclose=True)
@@ -124,7 +126,7 @@ def test_open_encrypted(input, password):
 
 @pytest.mark.parametrize(
     ["input", "password"],
-    _make_encryption_cases(TestResources.empty, ["superfluous"]),
+    _make_encryption_cases(TestFiles.empty, ["superfluous"]),
 )
 def test_open_with_excessive_password(input, password):
     pdf = pdfium.PdfDocument(input, password, autoclose=True)
@@ -137,11 +139,11 @@ def test_open_invalid():
     with pytest.raises(FileNotFoundError):
         pdf = pdfium.PdfDocument("invalid/path")
     with pytest.raises(pdfium.PdfiumError, match=re.escape("Failed to load document (PDFium: Incorrect password error).")):
-        pdf = pdfium.PdfDocument(TestResources.encrypted, password="wrong_password")
+        pdf = pdfium.PdfDocument(TestFiles.encrypted, password="wrong_password")
 
 
 def test_misc():
-    pdf = pdfium.PdfDocument(TestResources.empty)
+    pdf = pdfium.PdfDocument(TestFiles.empty)
     assert pdf.get_formtype() == pdfium_c.FORMTYPE_NONE
     assert pdf.get_version() == 15
     assert pdf.get_identifier(pdfium_c.FILEIDTYPE_PERMANENT) == b"\xec\xe5!\x04\xd6\x1b(R\x1a\x89f\x85\n\xbe\xa4"
@@ -154,7 +156,7 @@ def test_misc():
 
 def test_page_labels():
     # incidentally, it happens that this TOC test file also has page labels
-    pdf = pdfium.PdfDocument(TestResources.toc_viewmodes)
+    pdf = pdfium.PdfDocument(TestFiles.toc_viewmodes)
     exp_labels = ["i", "ii", "appendix-C", "appendix-D", "appendix-E", "appendix-F", "appendix-G", "appendix-H"]
     assert exp_labels == [pdf.get_page_label(i) for i in range(len(pdf))]
 
@@ -173,7 +175,7 @@ def _compare_metadata(pdf, metadata, exp_metadata):
 
 
 def test_metadata_dict():
-    pdf = pdfium.PdfDocument(TestResources.empty)
+    pdf = pdfium.PdfDocument(TestFiles.empty)
     metadata = pdf.get_metadata_dict()
     exp_metadata = {
         "Producer": "LibreOffice 6.4",
@@ -203,23 +205,19 @@ def test_new_page_on_new_pdf(new_pages):
     ]
 )
 def test_new_page_on_existing_pdf(new_pages):
-    pdf = pdfium.PdfDocument(TestResources.multipage)
+    pdf = pdfium.PdfDocument(TestFiles.multipage)
     for index, size in new_pages:
         page = pdf.new_page(*size, index=index)
         if index is None:
             index = len(pdf) - 1
         assert page.get_size() == pdf.get_page_size(index) == size
-    
-
-def test_del_page():
-    pass
 
 
 ImportTestSequence = [
-    (TestResources.empty, None, None, 1),
-    (TestResources.empty, "", 0, 1),
-    (TestResources.multipage, [1, 0, 1, 2, 1], 1, 5),
-    (TestResources.multipage, "2,1-3, 2", 4, 5),
+    (TestFiles.empty, None, None, 1),
+    (TestFiles.empty, "", 0, 1),
+    (TestFiles.multipage, [1, 0, 1, 2, 1], 1, 5),
+    (TestFiles.multipage, "2,1-3, 2", 4, 5),
 ]
 
 @pytest.mark.parametrize("sequence", [ImportTestSequence])
@@ -234,13 +232,9 @@ def test_import_pages(sequence):
         assert len(dest_pdf) == exp_len
 
 
-def test_formenv():
-    pass
-
-
 def test_closing_parent_closes_kids():
     
-    pdf = pdfium.PdfDocument(TestResources.multipage)
+    pdf = pdfium.PdfDocument(TestFiles.multipage)
     pages = list(pdf)
     assert len(pages) == 3
     pdf.close()
@@ -251,7 +245,7 @@ def test_closing_parent_closes_kids():
 
 
 def test_post_close():
-    pdf = pdfium.PdfDocument(TestResources.empty)
+    pdf = pdfium.PdfDocument(TestFiles.empty)
     pdf.close()
     with pytest.raises(ctypes.ArgumentError):
         pdf.get_version()
diff --git a/tests/test_matrix.py b/tests/test_matrix.py
deleted file mode 100644
index b4fb1b2f0..000000000
--- a/tests/test_matrix.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
-# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
diff --git a/tests/test_nup.py b/tests/test_nup.py
index 1d1f43bcb..5a20f1b91 100644
--- a/tests/test_nup.py
+++ b/tests/test_nup.py
@@ -4,14 +4,14 @@
 import pytest
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
-from .conftest import TestResources, OutputDir
+from .conftest import TestFiles, OutputDir
 
 
 def test_xobject_placement():
     
     # basic test to at least run through the code
     
-    src_pdf = pdfium.PdfDocument(TestResources.multipage)
+    src_pdf = pdfium.PdfDocument(TestFiles.multipage)
     dest_pdf = pdfium.PdfDocument.new()
     xobject = src_pdf.page_as_xobject(0, dest_pdf)
     
diff --git a/tests_old/test_opener.py b/tests/test_opener.py
similarity index 99%
rename from tests_old/test_opener.py
rename to tests/test_opener.py
index 4e0c0b186..4c22d08c0 100644
--- a/tests_old/test_opener.py
+++ b/tests/test_opener.py
@@ -1,6 +1,8 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
+# FIXME merge with test_document and deduplicate
+
 import re
 import shutil
 import tempfile
diff --git a/tests/test_page.py b/tests/test_page.py
index b4fb1b2f0..9bde27c0a 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1,2 +1,64 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+import pytest
+import pypdfium2 as pdfium
+# import pypdfium2.raw as pdfium_c
+from .conftest import TestFiles
+
+
+def test_boxes():
+    
+    pdf = pdfium.PdfDocument(TestFiles.render)
+    index = 0
+    page = pdf[index]
+    assert page.get_size() == pdf.get_page_size(index) == (595, 842)
+    assert page.get_mediabox() == (0, 0, 595, 842)
+    assert isinstance(page, pdfium.PdfPage)
+    
+    test_cases = [
+        ("media", (0,  0,  612, 792)),
+        ("media", (0,  0,  595, 842)),
+        ("crop",  (10, 10, 585, 832)),
+        ("bleed", (20, 20, 575, 822)),
+        ("trim",  (30, 30, 565, 812)),
+        ("art",   (40, 40, 555, 802)),
+    ]
+    
+    for meth_name, exp_box in test_cases:
+        getattr(page, "set_%sbox" % meth_name)(*exp_box)
+        box = getattr(page, "get_%sbox" % meth_name)()
+        assert pytest.approx(box) == exp_box
+
+
+def test_mediabox_fallback():
+    pdf = pdfium.PdfDocument(TestFiles.box_fallback)
+    page = pdf[0]
+    assert page.get_mediabox() == (0, 0, 612, 792)
+
+
+def test_rotation():
+    pdf = pdfium.PdfDocument.new()
+    page = pdf.new_page(500, 800)
+    for r in (90, 180, 270, 0):
+        page.set_rotation(r)
+        assert page.get_rotation() == r
+
+
+def test_page_labels():
+    # incidentally, it happens that this TOC test file also has page labels
+    pdf = pdfium.PdfDocument(TestFiles.toc_viewmodes)
+    exp_labels = ["i", "ii", "appendix-C", "appendix-D", "appendix-E", "appendix-F", "appendix-G", "appendix-H"]
+    assert exp_labels == [pdf.get_page_label(i) for i in range(len(pdf))]
+
+
+# # disabled because flattening takes no effect
+# def test_flatten():
+    
+#     pdf = pdfium.PdfDocument(TestFiles.form)
+#     page = pdf[0]
+    
+#     rc = page._flatten()
+#     assert rc == pdfium_c.FLATTEN_SUCCESS
+    
+#     # pdf.save(OutputDir / "flattened.pdf")
diff --git a/tests/test_pageobjects.py b/tests/test_pageobjects.py
index b4fb1b2f0..b7e8451f3 100644
--- a/tests/test_pageobjects.py
+++ b/tests/test_pageobjects.py
@@ -1,2 +1,266 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+import io
+import re
+import pytest
+import PIL.Image
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+from .conftest import TestFiles, OutputDir
+
+
+def compare_n2(data, exp_data):
+    assert len(data) == len(exp_data)
+    for d, exp_d in zip(data, exp_data):
+        assert pytest.approx(d, abs=1) == exp_d
+
+
+def test_image_objects():
+    pdf = pdfium.PdfDocument(TestFiles.images)
+    page = pdf[0]
+    assert page.pdf is pdf
+    
+    images = list( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
+    assert len(images) == 3
+    
+    img_0 = images[0]
+    assert isinstance(img_0, pdfium.PdfObject)
+    assert type(img_0) is pdfium.PdfImage
+    assert img_0.type == pdfium_c.FPDF_PAGEOBJ_IMAGE
+    assert isinstance(img_0.raw, pdfium_c.FPDF_PAGEOBJECT)
+    assert img_0.level == 0
+    assert img_0.page is page
+    assert img_0.pdf is pdf
+    
+    positions = [img.get_bounds() for img in images]
+    exp_positions = [
+        (133, 459, 350, 550),
+        (48, 652, 163, 700),
+        (204, 204, 577, 360),
+    ]
+    compare_n2(positions, exp_positions)
+    
+    compare_n2(
+        img_0.get_quad_points(),
+        ((132.7, 459.2), (349.5, 459.2), (349.5, 549.7), (132.7, 549.7))
+    )
+
+
+def test_misc_objects():
+    
+    pdf = pdfium.PdfDocument(TestFiles.render)
+    page = pdf[0]
+    assert page.pdf is pdf
+    
+    for obj in page.get_objects():
+        assert type(obj) is pdfium.PdfObject
+        assert isinstance(obj.raw, pdfium_c.FPDF_PAGEOBJECT)
+        assert obj.type in (pdfium_c.FPDF_PAGEOBJ_TEXT, pdfium_c.FPDF_PAGEOBJ_PATH)
+        assert obj.level == 0
+        assert obj.page is page
+        assert obj.pdf is pdf
+        pos = obj.get_bounds()
+        assert len(pos) == 4
+    
+    text_obj = next(obj for obj in page.get_objects() if obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT)
+    path_obj = next(obj for obj in page.get_objects() if obj.type == pdfium_c.FPDF_PAGEOBJ_PATH)
+    
+    compare_n2(
+        text_obj.get_quad_points(),
+        ((57.3, 767.4), (124.2, 767.4), (124.2, 780.9), (57.3, 780.9))
+    )
+    
+    with pytest.raises(RuntimeError, match=re.escape("Quad points only supported for image and text objects.")):
+        path_obj.get_quad_points()
+
+
+def test_new_image_from_jpeg():
+    """Load one JPEG into two new image objects (autoclose buffer, then inline) and check data, matrices and buffer cleanup."""
+    pdf = pdfium.PdfDocument.new()
+    page = pdf.new_page(240, 120)
+    
+    image_a = pdfium.PdfImage.new(pdf)
+    buffer = open(TestFiles.mona_lisa, "rb")  # deliberately no `with`: autoclose=True registers it in pdf._data_closer
+    image_a.load_jpeg(buffer, autoclose=True)
+    width, height = image_a.get_px_size()
+    page.insert_obj(image_a)
+    
+    assert len(pdf._data_holder) == 1
+    assert pdf._data_closer == [buffer]
+    
+    assert image_a.get_matrix() == pdfium.PdfMatrix()
+    image_a.set_matrix( pdfium.PdfMatrix().scale(width, height) )
+    assert image_a.get_matrix() == pdfium.PdfMatrix(width, 0, 0, height, 0, 0)
+    
+    pil_image_1 = PIL.Image.open(TestFiles.mona_lisa)
+    bitmap = image_a.get_bitmap()
+    pil_image_2 = bitmap.to_pil()
+    assert (120, 120) == pil_image_1.size == pil_image_2.size == (bitmap.width, bitmap.height)
+    assert "RGB" == pil_image_1.mode == pil_image_2.mode
+    
+    in_data = TestFiles.mona_lisa.read_bytes()
+    out_buffer = io.BytesIO()
+    image_a.extract(out_buffer)
+    out_buffer.seek(0)
+    out_data = out_buffer.read()
+    assert in_data == out_data
+    
+    metadata = image_a.get_metadata()
+    assert isinstance(metadata, pdfium_c.FPDF_IMAGEOBJ_METADATA)
+    assert metadata.bits_per_pixel == 24  # 3 channels, 8 bits each
+    assert metadata.colorspace == pdfium_c.FPDF_COLORSPACE_DEVICERGB
+    assert metadata.height == height == 120
+    assert metadata.width == width == 120
+    assert metadata.horizontal_dpi == 72
+    assert metadata.vertical_dpi == 72
+    
+    image_b = pdfium.PdfImage.new(pdf)
+    with open(TestFiles.mona_lisa, "rb") as buffer_b:  # distinct name, so `buffer` below still means the autoclose buffer
+        image_b.load_jpeg(buffer_b, inline=True, autoclose=False)
+    
+    assert image_b.get_matrix() == pdfium.PdfMatrix()
+    image_b.set_matrix( pdfium.PdfMatrix().scale(width, height).translate(width, 0) )
+    assert image_b.get_matrix() == pdfium.PdfMatrix(width, 0, 0, height, width, 0)
+    page.insert_obj(image_b)
+    
+    page.gen_content()
+    out_path = OutputDir / "image_jpeg.pdf"
+    pdf.save(out_path)
+    assert out_path.exists()
+    
+    page._finalizer()
+    pdf._finalizer()
+    assert buffer.closed is True  # the buffer passed with autoclose=True, checked after finalizing the pdf
+
+
+def test_new_image_from_bitmap():
+    
+    src_pdf = pdfium.PdfDocument(TestFiles.render)
+    src_page = src_pdf[0]
+    dst_pdf = pdfium.PdfDocument.new()
+    image_a = pdfium.PdfImage.new(dst_pdf)
+    
+    bitmap = src_page.render()
+    w, h = bitmap.width, bitmap.height
+    image_a.set_bitmap(bitmap)
+    image_a.set_matrix( pdfium.PdfMatrix().scale(w, h) )
+    
+    pil_image = PIL.Image.open(TestFiles.mona_lisa)
+    bitmap = pdfium.PdfBitmap.from_pil(pil_image)
+    image_b = pdfium.PdfImage.new(dst_pdf)
+    image_b.set_matrix( pdfium.PdfMatrix().scale(bitmap.width, bitmap.height) )
+    image_b.set_bitmap(bitmap)
+    
+    dst_page = dst_pdf.new_page(w, h)
+    dst_page.insert_obj(image_a)
+    dst_page.insert_obj(image_b)
+    dst_page.gen_content()
+    
+    out_path = OutputDir / "image_bitmap.pdf"
+    dst_pdf.save(out_path)
+    
+    reopened_pdf = pdfium.PdfDocument(out_path)
+    reopened_page = reopened_pdf[0]
+    reopened_image = next( reopened_page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
+    assert reopened_image.get_filters() == ["FlateDecode"]
+
+
+def test_replace_image_with_jpeg():
+    
+    pdf = pdfium.PdfDocument(TestFiles.images)
+    page = pdf[0]
+    
+    images = list( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
+    matrices = [img.get_matrix() for img in images]
+    assert len(images) == 3
+    image_1 = images[0]
+    
+    image_1.load_jpeg(TestFiles.mona_lisa, pages=[page])
+    width, height = image_1.get_px_size()
+    assert matrices == [img.get_matrix() for img in images]
+    
+    # preserve the aspect ratio
+    # this strategy only works if the matrix was just used for size/position
+    for image, matrix in zip(images, matrices):
+        w_scale = matrix.a / width
+        h_scale = matrix.d / height
+        scale = min(w_scale, h_scale)
+        new_matrix = pdfium.PdfMatrix(width*scale, 0, 0, height*scale, matrix.e, matrix.f)
+        image.set_matrix(new_matrix)
+        assert image.get_matrix() == new_matrix
+    
+    page.gen_content()
+    output_path = OutputDir / "replace_images.pdf"
+    pdf.save(output_path)
+    assert output_path.exists()
+
+
+@pytest.mark.parametrize(
+    "render", [False, True]
+)
+def test_image_get_bitmap(render):
+    
+    pdf = pdfium.PdfDocument(TestFiles.images)
+    page = pdf[0]
+    
+    all_images = list( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
+    image = all_images[0]
+    
+    metadata = image.get_metadata()
+    assert metadata.width == 115
+    assert metadata.height == 48
+    assert round(metadata.horizontal_dpi) == 38
+    assert round(metadata.vertical_dpi) == 38
+    assert metadata.colorspace == pdfium_c.FPDF_COLORSPACE_DEVICEGRAY
+    assert metadata.marked_content_id == 1
+    assert metadata.bits_per_pixel == 1
+    
+    bitmap = image.get_bitmap(render=render)
+    assert isinstance(bitmap, pdfium.PdfBitmap)
+    
+    if render:
+        assert bitmap.format == pdfium_c.FPDFBitmap_BGRA
+        assert bitmap.n_channels == 4
+        assert bitmap.width == 216
+        assert bitmap.height == 90
+        assert bitmap.stride == 864
+        assert bitmap.rev_byteorder is False
+        output_path = OutputDir / "extract_rendered.png"
+    else:
+        # NOTE fails with pdfium >= 1e1e173 (6015), < b5bc2e9 (6029), which returns RGB
+        assert bitmap.format == pdfium_c.FPDFBitmap_Gray
+        assert bitmap.n_channels == 1
+        assert bitmap.width == 115
+        assert bitmap.height == 48
+        assert bitmap.stride == 116
+        assert bitmap.rev_byteorder is False
+        output_path = OutputDir / "extract.png"
+    
+    pil_image = bitmap.to_pil()
+    assert isinstance(pil_image, PIL.Image.Image)
+    pil_image.save(output_path)
+    assert output_path.exists()
+
+
+def test_remove_image():
+    
+    pdf = pdfium.PdfDocument(TestFiles.images)
+    page_1 = pdf[0]
+    
+    # TODO order images by position
+    images = list( page_1.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
+    assert len(images) == 3
+    
+    # drop an image
+    page_1.remove_obj(images[0])
+    
+    # delete and re-insert an image in place
+    page_1.remove_obj(images[1])
+    page_1.insert_obj(images[1])
+    
+    page_1.gen_content()
+    
+    output_path = OutputDir / "test_remove_objects.pdf"
+    pdf.save(output_path)
+    assert output_path.exists()
diff --git a/tests/test_rendering.py b/tests/test_rendering.py
index b4fb1b2f0..f8d75508c 100644
--- a/tests/test_rendering.py
+++ b/tests/test_rendering.py
@@ -1,2 +1,381 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+import math
+import numpy
+import warnings
+import PIL.Image
+import pytest
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+from .conftest import (
+    TestFiles,
+    PyVersion,
+    OutputDir,
+    ExpRenderPixels
+)
+
+# TODO assert that bitmap and info are consistent
+
+
+@pytest.fixture
+def sample_page():
+    pdf = pdfium.PdfDocument(TestFiles.render)
+    page = pdf[0]
+    yield page
+
+
+@pytest.fixture
+def multipage_doc():
+    pdf = pdfium.PdfDocument(TestFiles.multipage)
+    yield pdf
+
+
+def _check_pixels(pil_image, pixels):
+    for pos, value in pixels:
+        assert pil_image.getpixel(pos) == value
+
+
+@pytest.mark.parametrize(
+    ("name", "crop", "scale", "rotation"),
+    [   
+        ["01_r0",      (0,   0,   0,   0  ), 0.25, 0,   ],
+        ["02_r90",     (0,   0,   0,   0  ), 0.5,  90,  ],
+        ["03_r180",    (0,   0,   0,   0  ), 0.75, 180, ],
+        ["04_r270",    (0,   0,   0,   0  ), 1,    270, ],
+        ["05_cl",      (100, 0,   0,   0  ), 0.5,  0,   ],
+        ["06_cb",      (0,   100, 0,   0  ), 0.5,  0,   ],
+        ["07_cr",      (0,   0,   100, 0  ), 0.5,  0,   ],
+        ["08_ct",      (0,   0,   0,   100), 0.5,  0,   ],
+        ["09_r90_cb",  (0,   100,  0,  0  ), 0.5,  90,  ],
+        ["10_r180_cr", (0,   0,   100, 0  ), 0.5,  180, ],
+        ["11_r270_ct", (0,   0,   0,   100), 0.5,  270, ],
+    ]
+)
+def test_render_page_transform(sample_page, name, crop, scale, rotation):
+    pil_image = sample_page.render(
+        crop = crop,
+        scale = scale,
+        rotation = rotation,
+    ).to_pil()
+    pil_image.save(OutputDir / ("%s.png" % name))
+    assert pil_image.mode == "RGB"
+    
+    c_left, c_bottom, c_right, c_top = [math.ceil(c*scale) for c in crop]
+    w = math.ceil(595*scale)
+    h = math.ceil(842*scale)
+    if rotation in (90, 270):
+        w, h = h, w
+    
+    c_w = w - c_left - c_right
+    c_h = h - c_bottom - c_top
+    assert pil_image.size == (c_w, c_h)
+    
+    pixels = []
+    for (x, y), value in ExpRenderPixels:
+        x, y = round(x*scale), round(y*scale)
+        if rotation in (90, 270):
+            x, y = y, x
+        if rotation == 90:
+            x = w-1 - x
+        elif rotation == 180:
+            x = w-1 - x
+            y = h-1 - y
+        elif rotation == 270:
+            y = h-1 - y
+        x -= c_left
+        y -= c_top
+        if 0 <= x < c_w and 0 <= y < c_h:
+            pixels.append( ((x, y), value) )
+    
+    _check_pixels(pil_image, pixels)
+
+
+@pytest.mark.parametrize(
+    "rev_byteorder", [False, True]
+)
+def test_render_page_bgrx(rev_byteorder, sample_page):
+    pil_image = sample_page.render(
+        prefer_bgrx = True,
+        rev_byteorder = rev_byteorder,
+    ).to_pil()
+    assert pil_image.mode == "RGBX"
+    exp_pixels = [(pos, (*value, 255)) for pos, value in ExpRenderPixels]
+    _check_pixels(pil_image, exp_pixels)
+
+
+def test_render_page_alpha(sample_page):
+    
+    pixels = [
+        [(0,   0  ), (0,   0,   0,   0  )],
+        [(62,  66 ), (0,   0,   0,   186)],
+        [(150, 180), (129, 212, 26,  255)],
+        [(150, 390), (42,  96,  153, 255)],
+        [(150, 570), (128, 0,   128, 255)],
+    ]
+    kwargs = dict(
+        fill_color = (0, 0, 0, 0),
+    )
+    image = sample_page.render(**kwargs).to_pil()
+    image_rev = sample_page.render(**kwargs, rev_byteorder=True).to_pil()
+    
+    if PyVersion > (3, 6):
+        assert image == image_rev
+    assert image.mode == "RGBA"
+    assert image.size == (595, 842)
+    for pos, exp_value in pixels:
+        assert image.getpixel(pos) == exp_value
+    
+    image.save(OutputDir / "colored_alpha.png")
+
+
+def test_render_page_grey(sample_page):
+    kwargs = dict(
+        grayscale = True,
+        scale = 0.5,
+    )
+    image = sample_page.render(**kwargs).to_pil()
+    image_rev = sample_page.render(**kwargs, rev_byteorder=True).to_pil()
+    assert image == image_rev
+    assert image.size == (298, 421)
+    assert image.mode == "L"
+    image.save(OutputDir / "grayscale.png")
+
+
+@pytest.mark.parametrize(
+    "fill_color",
+    [
+        (255, 255, 255, 255),
+        (60,  70,  80,  100),
+        (255, 255, 255, 255),
+        (0,   255, 255, 255),
+        (255, 0,   255, 255),
+        (255, 255, 0,   255),
+    ]
+)
+def test_render_page_fill_color(fill_color, sample_page):
+    kwargs = dict(
+        fill_color = fill_color,
+        scale = 0.5,
+    )
+    image = sample_page.render(**kwargs).to_pil()
+    image_rev = sample_page.render(**kwargs, rev_byteorder=True).to_pil()
+    
+    if PyVersion > (3, 6):
+        assert image == image_rev
+    
+    bg_pixel = image.getpixel( (0, 0) )
+    if fill_color[3] == 255:
+        fill_color = fill_color[:-1]
+    assert image.size == (298, 421)
+    assert bg_pixel == fill_color
+
+
+@pytest.mark.parametrize("rev_byteorder", [False, True])
+def test_render_page_tonumpy(rev_byteorder, sample_page):
+    
+    bitmap = sample_page.render(
+        rev_byteorder = rev_byteorder,
+    )
+    info, array = bitmap.get_info(), bitmap.to_numpy()
+    assert isinstance(array, numpy.ndarray)
+    assert isinstance(info, pdfium.PdfBitmapInfo)
+    if rev_byteorder:
+        assert info.mode == "RGB"
+    else:
+        assert info.mode == "BGR"
+    
+    for (x, y), value in ExpRenderPixels:
+        if rev_byteorder:
+            assert tuple(array[y][x]) == value
+        else:
+            assert tuple(array[y][x]) == tuple(reversed(value))
+
+
+@pytest.mark.parametrize("mode", [None, "lcd", "print"])
+def test_render_page_optimization(sample_page, mode):
+    pil_image = sample_page.render(
+        optimize_mode = mode,
+        scale = 0.5,
+    ).to_pil()
+    assert isinstance(pil_image, PIL.Image.Image)
+
+
+def test_render_page_noantialias(sample_page):
+    pil_image = sample_page.render(
+        no_smoothtext  = True,
+        no_smoothimage = True,
+        no_smoothpath  = True,
+        scale = 0.5,
+    ).to_pil()
+    assert isinstance(pil_image, PIL.Image.Image)
+
+
+def test_render_pages_no_concurrency(multipage_doc):
+    for page in multipage_doc:
+        image = page.render(
+            scale = 0.5,
+            grayscale = True,
+        ).to_pil()
+        assert isinstance(image, PIL.Image.Image)
+
+
+@pytest.fixture
+def render_pdffile_topil(multipage_doc):
+    
+    renderer = multipage_doc.render(
+        pdfium.PdfBitmap.to_pil,
+        scale = 0.5,
+    )
+    imgs = []
+    
+    for image in renderer:
+        assert isinstance(image, PIL.Image.Image)
+        assert image.mode == "RGB"
+        imgs.append(image)
+    
+    assert len(imgs) == 3
+    yield imgs
+
+
+@pytest.fixture
+def render_pdffile_tonumpy(multipage_doc):
+    
+    renderer = multipage_doc.render(
+        pdfium.PdfBitmap.to_numpy,
+        scale = 0.5,
+        rev_byteorder = True,
+        pass_info = True,
+    )
+    imgs = []
+    
+    for array, info in renderer:
+        assert info.mode == "RGB"
+        assert isinstance(array, numpy.ndarray)
+        pil_image = PIL.Image.fromarray(array, mode=info.mode)
+        imgs.append(pil_image)
+    
+    # for i, img in enumerate(imgs):
+    #     img.save(OutputDir / ("numpy_%s.png" % i))
+    
+    assert len(imgs) == 3
+    yield imgs
+
+
+def test_render_pdffile(render_pdffile_topil, render_pdffile_tonumpy):
+    for a, b in zip(render_pdffile_topil, render_pdffile_tonumpy):
+        assert a == b
+
+
+def test_render_pdf_new():
+    
+    # two pages to actually reach the process pool and not just the single-page shortcut
+    pdf = pdfium.PdfDocument.new()
+    page_1 = pdf.new_page(50, 100)
+    page_2 = pdf.new_page(50, 100)
+    renderer = pdf.render(pdfium.PdfBitmap.to_pil)
+    bitmap_p1 = next(renderer)
+
+
+def test_render_pdfbuffer():
+    
+    buffer = open(TestFiles.multipage, "rb")
+    pdf = pdfium.PdfDocument(buffer)
+        
+    renderer = pdf.render(pdfium.PdfBitmap.to_pil)
+    bitmap_p1 = next(renderer)
+
+
+@pytest.mark.parametrize(
+    ("with_forms", "exp_color"),
+    [
+        (False, (255, 255, 255)),
+        (True, (0, 51, 113)),
+    ]
+)
+def test_render_form(with_forms, exp_color):
+    
+    pdf = pdfium.PdfDocument(TestFiles.forms)
+    if with_forms:
+        pdf.init_forms()
+    
+    if with_forms:
+        assert isinstance(pdf.formenv, pdfium.PdfFormEnv)
+    else:
+        assert pdf.formenv is None
+    
+    page = pdf[0]
+    image = page.render(
+        may_draw_forms = with_forms,
+    ).to_pil()
+    
+    assert image.getpixel( (190, 190) ) == exp_color
+    assert image.getpixel( (190, 430) ) == exp_color
+    assert image.getpixel( (190, 480) ) == exp_color
+
+
+def test_numpy_nocopy(sample_page):
+    bitmap = sample_page.render(scale=0.1)
+    array = bitmap.to_numpy()
+    assert (bitmap.width, bitmap.height) == (60, 85)
+    val_a, val_b = 255, 123
+    assert array[0][0][0] == val_a
+    bitmap.buffer[0] = val_b
+    assert array[0][0][0] == val_b
+    array[0][0][0] = val_a
+    assert bitmap.buffer[0] == val_a
+
+
+@pytest.mark.parametrize(
+    ("bitmap_format", "rev_byteorder", "is_referenced"),
+    [
+        (pdfium_c.FPDFBitmap_BGR,  False, False),
+        (pdfium_c.FPDFBitmap_BGR,  True,  False),
+        (pdfium_c.FPDFBitmap_BGRA, False, False),
+        (pdfium_c.FPDFBitmap_BGRA, True,  True),
+        (pdfium_c.FPDFBitmap_BGRx, False, False),
+        (pdfium_c.FPDFBitmap_BGRx, True,  True),
+        (pdfium_c.FPDFBitmap_Gray, False, True),
+    ]
+)
+def test_pil_nocopy_where_possible(bitmap_format, rev_byteorder, is_referenced, sample_page):
+    """Check per format/byte order whether the image from to_pil() shares memory with bitmap.buffer (is_referenced) or is a copy."""
+    bitmap = sample_page.render(
+        scale = 0.1,
+        rev_byteorder = rev_byteorder,
+        force_bitmap_format = bitmap_format,
+    )
+    pil_image = bitmap.to_pil()
+    assert pil_image.size == (60, 85)
+    
+    val_a, val_b = 255, 123
+    if bitmap.n_channels == 4:
+        pixel_a = (val_a, 255, 255, 255)
+        pixel_b = (val_b, 255, 255, 255)
+    elif bitmap.n_channels == 3:
+        pixel_a = (val_a, 255, 255)
+        pixel_b = (val_b, 255, 255)
+    elif bitmap.n_channels == 1:
+        pixel_a = val_a
+        pixel_b = val_b
+    else:
+        assert False
+    
+    assert pil_image.getpixel((0, 0)) == pixel_a
+    bitmap.buffer[0] = val_b
+    
+    if is_referenced:
+        
+        # changes to the buffer are reflected in the image
+        assert pil_image.getpixel((0, 0)) == pixel_b
+        
+        # changes to the image are reflected in the buffer, since we set `.readonly = False` after image init
+        pil_image.putpixel((0, 0), pixel_a)
+        assert pil_image.getpixel((0, 0)) == pixel_a
+        assert bitmap.buffer[0] == val_a
+        
+    else:
+        if pil_image.getpixel((0, 0)) == pixel_b:
+            warnings.warn(f"PIL now references {bitmap.mode} mode bitmaps.")
+        else:
+            assert pil_image.getpixel((0, 0)) == pixel_a
diff --git a/tests/test_saving.py b/tests/test_saving.py
index cfea22008..36c95cc1e 100644
--- a/tests/test_saving.py
+++ b/tests/test_saving.py
@@ -4,10 +4,11 @@
 import io
 import pytest
 import pypdfium2 as pdfium
-from .conftest import TestResources
+import pypdfium2.raw as pdfium_c
+from .conftest import TestFiles, OutputDir
 
 
-def _get_saving_handler(version):
+def _new_pdf_saving_handler(version):
     
     pdf = pdfium.PdfDocument.new()
     size = (612, 792)
@@ -26,12 +27,12 @@ def _get_saving_handler(version):
     yield
 
 
-def _save_tofile(version, tmp_path, use_str):
+def _save_to_file(version, tmp_path, use_str):
     
-    handler = _get_saving_handler(version)
+    handler = _new_pdf_saving_handler(version)
     pdf, kwargs = next(handler)
     
-    path = tmp_path / "test_save_tofile.pdf"
+    path = tmp_path / "test_save_to_file.pdf"
     dest = str(path) if use_str else path
     
     pdf.save(dest, **kwargs)
@@ -44,20 +45,17 @@ def _save_tofile(version, tmp_path, use_str):
 
 parametrize_saving_version = pytest.mark.parametrize("version", [None, 14, 17])
 
-
-def test_save_to_strpath(tmp_path):
-    _save_tofile(15, tmp_path, use_str=True)
-
+def test_save_new_to_strpath(tmp_path):
+    _save_to_file(15, tmp_path, use_str=True)
 
 @parametrize_saving_version
-def test_save_to_path(version, tmp_path):
-    _save_tofile(version, tmp_path, use_str=False)
-
+def test_save_new_to_path(version, tmp_path):
+    _save_to_file(version, tmp_path, use_str=False)
 
 @parametrize_saving_version
-def test_save_tobuffer(version):
+def test_save_new_to_buffer(version):
     
-    handler = _get_saving_handler(version)
+    handler = _new_pdf_saving_handler(version)
     pdf, kwargs = next(handler)
     
     out_buffer = io.BytesIO()
@@ -69,23 +67,72 @@ def test_save_tobuffer(version):
     handler.send(saved_pdf)
 
 
-def test_save_deletion():
-    
+def test_save_tiled():
+    """Import the multipage document onto a single 595x842 page via raw FPDF_ImportNPagesToOne, then save the result."""
+    src_pdf = pdfium.PdfDocument(TestFiles.multipage)
+    new_pdf_raw = pdfium_c.FPDF_ImportNPagesToOne(
+        src_pdf.raw,
+        595, 842,
+        2, 2,
+    )
+
+    new_pdf = pdfium.PdfDocument(new_pdf_raw)
+    assert len(new_pdf) == 1
+    page = new_pdf[0]
+    assert page.get_size() == (595, 842)
+
+    output_file = OutputDir / "tiling.pdf"
+    new_pdf.save(output_file)
+    assert output_file.exists()
+
+
+def test_save_with_deletion():
+
     # Regression test for BUG(96):
     # Make sure page deletions take effect when saving a document
-    
-    pdf = pdfium.PdfDocument(TestResources.multipage)
+
+    pdf = pdfium.PdfDocument(TestFiles.multipage)
     assert len(pdf) == 3
     pdf.del_page(0)
     assert len(pdf) == 2
-    
+
     buffer = io.BytesIO()
     pdf.save(buffer)
     buffer.seek(0)
-    
+
     saved_pdf = pdfium.PdfDocument(buffer, autoclose=True)
     assert len(saved_pdf) == 2
-    
+
     page = saved_pdf[0]
     textpage = page.get_textpage()
     assert textpage.get_text_bounded() == "Page\r\n2"
+
+
+def test_save_and_check_id():  # includes deletion, versioned save, and raw data start/end check
+    """Save a modified document and check that the permanent file ID is kept while the changing ID differs afterwards."""
+    pdf = pdfium.PdfDocument(TestFiles.multipage)
+    pre_id_p = pdf.get_identifier(pdfium_c.FILEIDTYPE_PERMANENT)
+    pre_id_c = pdf.get_identifier(pdfium_c.FILEIDTYPE_CHANGING)
+    assert isinstance(pre_id_p, bytes)
+    pdf.del_page(1)
+
+    buffer = io.BytesIO()
+    pdf.save(buffer, version=17)
+
+    buffer.seek(0)
+    data = buffer.read()
+    buffer.seek(0)
+
+    exp_start = b"%PDF-1.7"
+    exp_end = b"%EOF\r\n"
+    assert data[:len(exp_start)] == exp_start
+    assert data[-len(exp_end):] == exp_end
+
+    reopened_pdf = pdfium.PdfDocument(buffer, autoclose=True)
+    assert len(reopened_pdf) == 2
+    assert reopened_pdf.get_version() == 17
+
+    post_id_p = reopened_pdf.get_identifier(pdfium_c.FILEIDTYPE_PERMANENT)
+    post_id_c = reopened_pdf.get_identifier(pdfium_c.FILEIDTYPE_CHANGING)
+    assert pre_id_p == post_id_p
+    assert pre_id_c != post_id_c
diff --git a/tests/test_textpage.py b/tests/test_textpage.py
index 4bacf49b5..eeacc9e33 100644
--- a/tests/test_textpage.py
+++ b/tests/test_textpage.py
@@ -1,8 +1,143 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
+import re
+import pytest
 import pypdfium2 as pdfium
-from .conftest import TestResources
+from .conftest import TestFiles
+
+
+@pytest.fixture
+def text_pdf():
+    pdf = pdfium.PdfDocument(TestFiles.text)
+    yield pdf
+
+
+@pytest.fixture
+def textpage(text_pdf):
+    page = text_pdf[0]
+    textpage = page.get_textpage()
+    assert isinstance(textpage, pdfium.PdfTextPage)
+    yield textpage
+
+
+def test_gettext(textpage):
+    text_a = textpage.get_text_bounded()
+    text_b = textpage.get_text_range()
+    assert text_a == text_b
+    assert len(text_a) == 438
+    exp_start = "Lorem ipsum dolor sit amet,\r\n"
+    exp_end = "\r\nofficia deserunt mollit anim id est laborum."
+    assert text_a.startswith(exp_start)
+    assert text_a.endswith(exp_end)
+    text_start = textpage.get_text_range(0, len(exp_start))
+    text_end_a = textpage.get_text_range(len(text_a)-len(exp_end))  # count=-1
+    text_end_b = textpage.get_text_range(len(text_a)-len(exp_end), len(exp_end))
+    assert text_start == exp_start
+    assert text_end_a == text_end_b == exp_end
+
+
+@pytest.mark.parametrize("loose", [False, True])
+def test_getcharbox(textpage, loose):
+    for index in range(textpage.count_chars()):
+        box = textpage.get_charbox(index, loose=loose)
+        assert all( isinstance(val, (int, float)) for val in box )
+        assert box[0] <= box[2] and box[1] <= box[3]
+
+
+def test_getrectboxes(textpage):
+    n_rects = textpage.count_rects()
+    rects = [textpage.get_rect(i) for i in range(n_rects)]
+    assert len(rects) == 10
+    # first rect: check its bounds, its bounded text, and agreement with the char-range API
+    first_rect = rects[0]
+    assert pytest.approx(first_rect, abs=1) == (58, 767, 258, 782)
+    first_text = textpage.get_text_bounded(*first_rect)
+    assert first_text == "Lorem ipsum dolor sit amet,"
+    assert textpage.get_text_range(0, len(first_text)) == first_text
+
+    for rect in rects:
+        assert len(rect) == 4
+        assert 56 < rect[0] < 59
+        text = textpage.get_text_bounded(*rect)
+        assert isinstance(text, str)
+        assert len(text) <= 66
+    # after the loop, `text` is the last rect's text; it must equal the tail of the page's char range
+    assert text == "officia deserunt mollit anim id est laborum."
+    assert textpage.get_text_range(textpage.count_chars()-len(text)) == text  # count=-1
+
+
+def _get_rects(textpage, search_result):
+    # Return the rects for a search result (char index, char count), or [] if there is none. TODO add helper?
+    if search_result is None:
+        return []
+    c_index, c_count = search_result
+    r_index = textpage.count_rects(0, c_index) - 1  # rects counted over chars [0, c_index), minus one: used as first rect index
+    r_count = textpage.count_rects(c_index, c_count)  # rects counted over the matched char range
+    textpage.count_rects()  # NOTE(review): presumably resets rect counting to the whole page so get_rect() takes page-wide indices - confirm against pdfium
+    rects = [textpage.get_rect(i) for i in range(r_index, r_index+r_count)]
+    return rects
+
+
+def test_search_text(textpage):
+    searcher = textpage.search("labor")
+
+    occ_1a = searcher.get_next()
+    occ_2a = searcher.get_next()
+    occ_3a = searcher.get_next()
+    occ_4x = searcher.get_next()
+    occ_2b = searcher.get_prev()
+    occ_1b = searcher.get_prev()
+
+    assert occ_1a == (89, 5)
+    assert occ_2a == (181, 5)
+    assert occ_3a == (430, 5)
+    assert occ_4x is None
+    assert occ_1a == occ_1b and occ_2a == occ_2b
+
+    occs = (occ_1a, occ_2a, occ_3a)
+    exp_rectlists = [
+        [ (57, 675, 511, 690) ],
+        [ (58, 638, 537, 653) ],
+        [ (58, 549, 367, 561) ],
+    ]
+
+    for occ, exp_rects in zip(occs, exp_rectlists):
+        rects = _get_rects(textpage, occ)
+        assert [pytest.approx(r, abs=0.5) for r in rects] == exp_rects
+
+
+def test_get_index(textpage):
+
+    x, y = (60, textpage.page.get_height()-66)
+
+    index = textpage.get_index(x, y, 5, 5)
+    assert index < textpage.count_chars() and index == 0
+
+    charbox = textpage.get_charbox(index)
+    char = textpage.get_text_bounded(*charbox)
+    assert char == "L"
+
+
+def test_textpage_empty():
+    pdf = pdfium.PdfDocument(TestFiles.empty)
+    page = pdf[0]
+    textpage = page.get_textpage()
+
+    assert textpage.get_text_bounded() == ""
+    assert textpage.get_text_range() == ""
+    assert textpage.count_chars() == 0
+    assert textpage.count_rects() == 0
+    assert textpage.get_index(0, 0, 0, 0) is None
+
+    searcher = textpage.search("a")
+    assert searcher.get_next() is None
+
+    with pytest.raises(pdfium.PdfiumError, match=re.escape("Failed to get charbox.")):
+        textpage.get_charbox(0)
+    with pytest.raises(ValueError, match=re.escape("Text length must be greater than 0.")):
+        textpage.search("")
+
 
 
 def test_get_text_bounded_defaults_with_rotation():
@@ -10,7 +145,7 @@ def test_get_text_bounded_defaults_with_rotation():
     # Regression test for BUG(149):
     # Make sure defaults use native PDF coordinates instead of normalized page size
     
-    pdf = pdfium.PdfDocument(TestResources.text)
+    pdf = pdfium.PdfDocument(TestFiles.text)
     page = pdf[0]
     page.set_rotation(90)
     textpage = page.get_textpage()
diff --git a/tests/test_toc.py b/tests/test_toc.py
index a732822a3..851475910 100644
--- a/tests/test_toc.py
+++ b/tests/test_toc.py
@@ -5,7 +5,7 @@
 import logging
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
-from .conftest import TestResources
+from .conftest import TestFiles
 
 
 def _compare_bookmark(bm, **kwargs):
@@ -25,7 +25,7 @@ def _compare_bookmark(bm, **kwargs):
 
 def test_gettoc():
     
-    pdf = pdfium.PdfDocument(TestResources.toc)
+    pdf = pdfium.PdfDocument(TestFiles.toc)
     toc = pdf.get_toc()
     
     # check first bookmark
@@ -58,7 +58,7 @@ def test_gettoc():
 
 def test_gettoc_circular(caplog):
     
-    pdf = pdfium.PdfDocument(TestResources.toc_circular)
+    pdf = pdfium.PdfDocument(TestFiles.toc_circular)
     toc = pdf.get_toc()
     
     _compare_bookmark(
diff --git a/tests_old/__init__.py b/tests_old/__init__.py
deleted file mode 100644
index b4fb1b2f0..000000000
--- a/tests_old/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
-# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
diff --git a/tests_old/conftest.py b/tests_old/conftest.py
deleted file mode 100644
index 03eb520a6..000000000
--- a/tests_old/conftest.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
-# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
-
-import sys
-from pathlib import Path
-# import pypdfium2.__main__ as pdfium_cli
-
-# if tests/ and tests_old/ are run together as usual, this would initialize logging twice
-# pdfium_cli.setup_logging()
-
-PyVersion = (sys.version_info.major, sys.version_info.minor)
-
-TestDir     = Path(__file__).absolute().parent
-ProjectDir  = TestDir.parent
-ResourceDir = TestDir / "resources"
-OutputDir   = TestDir / "output"
-
-class TestFiles:
-    render        = ResourceDir / "render.pdf"
-    encrypted     = ResourceDir / "encrypted.pdf"
-    multipage     = ResourceDir / "multipage.pdf"
-    toc           = ResourceDir / "toc.pdf"
-    toc_viewmodes = ResourceDir / "toc_viewmodes.pdf"
-    toc_maxdepth  = ResourceDir / "toc_maxdepth.pdf"
-    toc_circular  = ResourceDir / "toc_circular.pdf"
-    box_fallback  = ResourceDir / "box_fallback.pdf"
-    text          = ResourceDir / "text.pdf"
-    empty         = ResourceDir / "empty.pdf"
-    images        = ResourceDir / "images.pdf"
-    form          = ResourceDir / "forms.pdf"
-    attachments   = ResourceDir / "attachments.pdf"
-    mona_lisa     = ResourceDir / "mona_lisa.jpg"
-
-
-ExpRenderPixels = (
-    ( (0,   0  ), (255, 255, 255) ),
-    ( (150, 180), (129, 212, 26 ) ),
-    ( (150, 390), (42,  96,  153) ),
-    ( (150, 570), (128, 0,   128) ),
-)
-
-
-def iterate_testfiles(skip_encrypted=True):
-    encrypted = (TestFiles.encrypted, )
-    for attr_name in dir(TestFiles):
-        if attr_name.startswith("_"):
-            continue
-        member = getattr(TestFiles, attr_name)
-        if skip_encrypted and member in encrypted:
-            continue
-        yield member
-
-
-def get_members(cls):
-    members = []
-    for attr in dir(cls):
-        if attr.startswith("_"):
-            continue
-        members.append( getattr(cls, attr) )
-    return members
-
-
-def test_testpaths():
-    for dirpath in (TestDir, ProjectDir, ResourceDir, OutputDir):
-        assert dirpath.is_dir()
-    for filepath in iterate_testfiles(False):
-        assert filepath.is_file()
diff --git a/tests_old/output/.gitkeep b/tests_old/output/.gitkeep
deleted file mode 100644
index 8d1c8b69c..000000000
--- a/tests_old/output/.gitkeep
+++ /dev/null
@@ -1 +0,0 @@
- 
diff --git a/tests_old/resources b/tests_old/resources
deleted file mode 120000
index 23a8f00b2..000000000
--- a/tests_old/resources
+++ /dev/null
@@ -1 +0,0 @@
-../tests/resources/
\ No newline at end of file
diff --git a/tests_old/test_page.py b/tests_old/test_page.py
deleted file mode 100644
index 9bde27c0a..000000000
--- a/tests_old/test_page.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
-# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
-
-import pytest
-import pypdfium2 as pdfium
-# import pypdfium2.raw as pdfium_c
-from .conftest import TestFiles
-
-
-def test_boxes():
-    
-    pdf = pdfium.PdfDocument(TestFiles.render)
-    index = 0
-    page = pdf[index]
-    assert page.get_size() == pdf.get_page_size(index) == (595, 842)
-    assert page.get_mediabox() == (0, 0, 595, 842)
-    assert isinstance(page, pdfium.PdfPage)
-    
-    test_cases = [
-        ("media", (0,  0,  612, 792)),
-        ("media", (0,  0,  595, 842)),
-        ("crop",  (10, 10, 585, 832)),
-        ("bleed", (20, 20, 575, 822)),
-        ("trim",  (30, 30, 565, 812)),
-        ("art",   (40, 40, 555, 802)),
-    ]
-    
-    for meth_name, exp_box in test_cases:
-        getattr(page, "set_%sbox" % meth_name)(*exp_box)
-        box = getattr(page, "get_%sbox" % meth_name)()
-        assert pytest.approx(box) == exp_box
-
-
-def test_mediabox_fallback():
-    pdf = pdfium.PdfDocument(TestFiles.box_fallback)
-    page = pdf[0]
-    assert page.get_mediabox() == (0, 0, 612, 792)
-
-
-def test_rotation():
-    pdf = pdfium.PdfDocument.new()
-    page = pdf.new_page(500, 800)
-    for r in (90, 180, 270, 0):
-        page.set_rotation(r)
-        assert page.get_rotation() == r
-
-
-def test_page_labels():
-    # incidentally, it happens that this TOC test file also has page labels
-    pdf = pdfium.PdfDocument(TestFiles.toc_viewmodes)
-    exp_labels = ["i", "ii", "appendix-C", "appendix-D", "appendix-E", "appendix-F", "appendix-G", "appendix-H"]
-    assert exp_labels == [pdf.get_page_label(i) for i in range(len(pdf))]
-
-
-# # disabled because flattening takes no effect
-# def test_flatten():
-    
-#     pdf = pdfium.PdfDocument(TestFiles.form)
-#     page = pdf[0]
-    
-#     rc = page._flatten()
-#     assert rc == pdfium_c.FLATTEN_SUCCESS
-    
-#     # pdf.save(OutputDir / "flattened.pdf")
diff --git a/tests_old/test_pageobject.py b/tests_old/test_pageobject.py
deleted file mode 100644
index b7e8451f3..000000000
--- a/tests_old/test_pageobject.py
+++ /dev/null
@@ -1,266 +0,0 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
-# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
-
-import io
-import re
-import pytest
-import PIL.Image
-import pypdfium2 as pdfium
-import pypdfium2.raw as pdfium_c
-from .conftest import TestFiles, OutputDir
-
-
-def compare_n2(data, exp_data):
-    assert len(data) == len(exp_data)
-    for d, exp_d in zip(data, exp_data):
-        assert pytest.approx(d, abs=1) == exp_d
-
-
-def test_image_objects():
-    pdf = pdfium.PdfDocument(TestFiles.images)
-    page = pdf[0]
-    assert page.pdf is pdf
-    
-    images = list( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
-    assert len(images) == 3
-    
-    img_0 = images[0]
-    assert isinstance(img_0, pdfium.PdfObject)
-    assert type(img_0) is pdfium.PdfImage
-    assert img_0.type == pdfium_c.FPDF_PAGEOBJ_IMAGE
-    assert isinstance(img_0.raw, pdfium_c.FPDF_PAGEOBJECT)
-    assert img_0.level == 0
-    assert img_0.page is page
-    assert img_0.pdf is pdf
-    
-    positions = [img.get_bounds() for img in images]
-    exp_positions = [
-        (133, 459, 350, 550),
-        (48, 652, 163, 700),
-        (204, 204, 577, 360),
-    ]
-    compare_n2(positions, exp_positions)
-    
-    compare_n2(
-        img_0.get_quad_points(),
-        ((132.7, 459.2), (349.5, 459.2), (349.5, 549.7), (132.7, 549.7))
-    )
-
-
-def test_misc_objects():
-    
-    pdf = pdfium.PdfDocument(TestFiles.render)
-    page = pdf[0]
-    assert page.pdf is pdf
-    
-    for obj in page.get_objects():
-        assert type(obj) is pdfium.PdfObject
-        assert isinstance(obj.raw, pdfium_c.FPDF_PAGEOBJECT)
-        assert obj.type in (pdfium_c.FPDF_PAGEOBJ_TEXT, pdfium_c.FPDF_PAGEOBJ_PATH)
-        assert obj.level == 0
-        assert obj.page is page
-        assert obj.pdf is pdf
-        pos = obj.get_bounds()
-        assert len(pos) == 4
-    
-    text_obj = next(obj for obj in page.get_objects() if obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT)
-    path_obj = next(obj for obj in page.get_objects() if obj.type == pdfium_c.FPDF_PAGEOBJ_PATH)
-    
-    compare_n2(
-        text_obj.get_quad_points(),
-        ((57.3, 767.4), (124.2, 767.4), (124.2, 780.9), (57.3, 780.9))
-    )
-    
-    with pytest.raises(RuntimeError, match=re.escape("Quad points only supported for image and text objects.")):
-        path_obj.get_quad_points()
-
-
-def test_new_image_from_jpeg():
-    
-    pdf = pdfium.PdfDocument.new()
-    page = pdf.new_page(240, 120)
-    
-    image_a = pdfium.PdfImage.new(pdf)
-    buffer = open(TestFiles.mona_lisa, "rb")
-    image_a.load_jpeg(buffer, autoclose=True)
-    width, height = image_a.get_px_size()
-    page.insert_obj(image_a)
-    
-    assert len(pdf._data_holder) == 1
-    assert pdf._data_closer == [buffer]
-    
-    assert image_a.get_matrix() == pdfium.PdfMatrix()
-    image_a.set_matrix( pdfium.PdfMatrix().scale(width, height) )
-    assert image_a.get_matrix() == pdfium.PdfMatrix(width, 0, 0, height, 0, 0)
-    
-    pil_image_1 = PIL.Image.open(TestFiles.mona_lisa)
-    bitmap = image_a.get_bitmap()
-    pil_image_2 = bitmap.to_pil()
-    assert (120, 120) == pil_image_1.size == pil_image_2.size == (bitmap.width, bitmap.height)
-    assert "RGB" == pil_image_1.mode == pil_image_2.mode
-    
-    in_data = TestFiles.mona_lisa.read_bytes()
-    out_buffer = io.BytesIO()
-    image_a.extract(out_buffer)
-    out_buffer.seek(0)
-    out_data = out_buffer.read()
-    assert in_data == out_data
-    
-    metadata = image_a.get_metadata()
-    assert isinstance(metadata, pdfium_c.FPDF_IMAGEOBJ_METADATA)
-    assert metadata.bits_per_pixel == 24  # 3 channels, 8 bits each
-    assert metadata.colorspace == pdfium_c.FPDF_COLORSPACE_DEVICERGB
-    assert metadata.height == height == 120
-    assert metadata.width == width == 120
-    assert metadata.horizontal_dpi == 72
-    assert metadata.vertical_dpi == 72
-    
-    image_b = pdfium.PdfImage.new(pdf)
-    with open(TestFiles.mona_lisa, "rb") as buffer:
-        image_b.load_jpeg(buffer, inline=True, autoclose=False)
-    
-    assert image_b.get_matrix() == pdfium.PdfMatrix()
-    image_b.set_matrix( pdfium.PdfMatrix().scale(width, height).translate(width, 0) )
-    image_b.get_matrix() == pdfium.PdfMatrix(width, 0, 0, height, width, 0)
-    page.insert_obj(image_b)
-    
-    page.gen_content()
-    out_path = OutputDir / "image_jpeg.pdf"
-    pdf.save(out_path)
-    assert out_path.exists()
-    
-    page._finalizer()
-    pdf._finalizer()
-    assert buffer.closed is True
-
-
-def test_new_image_from_bitmap():
-    
-    src_pdf = pdfium.PdfDocument(TestFiles.render)
-    src_page = src_pdf[0]
-    dst_pdf = pdfium.PdfDocument.new()
-    image_a = pdfium.PdfImage.new(dst_pdf)
-    
-    bitmap = src_page.render()
-    w, h = bitmap.width, bitmap.height
-    image_a.set_bitmap(bitmap)
-    image_a.set_matrix( pdfium.PdfMatrix().scale(w, h) )
-    
-    pil_image = PIL.Image.open(TestFiles.mona_lisa)
-    bitmap = pdfium.PdfBitmap.from_pil(pil_image)
-    image_b = pdfium.PdfImage.new(dst_pdf)
-    image_b.set_matrix( pdfium.PdfMatrix().scale(bitmap.width, bitmap.height) )
-    image_b.set_bitmap(bitmap)
-    
-    dst_page = dst_pdf.new_page(w, h)
-    dst_page.insert_obj(image_a)
-    dst_page.insert_obj(image_b)
-    dst_page.gen_content()
-    
-    out_path = OutputDir / "image_bitmap.pdf"
-    dst_pdf.save(out_path)
-    
-    reopened_pdf = pdfium.PdfDocument(out_path)
-    reopened_page = reopened_pdf[0]
-    reopened_image = next( reopened_page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
-    assert reopened_image.get_filters() == ["FlateDecode"]
-
-
-def test_replace_image_with_jpeg():
-    
-    pdf = pdfium.PdfDocument(TestFiles.images)
-    page = pdf[0]
-    
-    images = list( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
-    matrices = [img.get_matrix() for img in images]
-    assert len(images) == 3
-    image_1 = images[0]
-    
-    image_1.load_jpeg(TestFiles.mona_lisa, pages=[page])
-    width, height = image_1.get_px_size()
-    assert matrices == [img.get_matrix() for img in images]
-    
-    # preserve the aspect ratio
-    # this strategy only works if the matrix was just used for size/position
-    for image, matrix in zip(images, matrices):
-        w_scale = matrix.a / width
-        h_scale = matrix.d / height
-        scale = min(w_scale, h_scale)
-        new_matrix = pdfium.PdfMatrix(width*scale, 0, 0, height*scale, matrix.e, matrix.f)
-        image.set_matrix(new_matrix)
-        assert image.get_matrix() == new_matrix
-    
-    page.gen_content()
-    output_path = OutputDir / "replace_images.pdf"
-    pdf.save(output_path)
-    assert output_path.exists()
-
-
-@pytest.mark.parametrize(
-    "render", [False, True]
-)
-def test_image_get_bitmap(render):
-    
-    pdf = pdfium.PdfDocument(TestFiles.images)
-    page = pdf[0]
-    
-    all_images = list( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
-    image = all_images[0]
-    
-    metadata = image.get_metadata()
-    assert metadata.width == 115
-    assert metadata.height == 48
-    assert round(metadata.horizontal_dpi) == 38
-    assert round(metadata.vertical_dpi) == 38
-    assert metadata.colorspace == pdfium_c.FPDF_COLORSPACE_DEVICEGRAY
-    assert metadata.marked_content_id == 1
-    assert metadata.bits_per_pixel == 1
-    
-    bitmap = image.get_bitmap(render=render)
-    assert isinstance(bitmap, pdfium.PdfBitmap)
-    
-    if render:
-        assert bitmap.format == pdfium_c.FPDFBitmap_BGRA
-        assert bitmap.n_channels == 4
-        assert bitmap.width == 216
-        assert bitmap.height == 90
-        assert bitmap.stride == 864
-        assert bitmap.rev_byteorder is False
-        output_path = OutputDir / "extract_rendered.png"
-    else:
-        # NOTE fails with pdfium >= 1e1e173 (6015), < b5bc2e9 (6029), which returns RGB
-        assert bitmap.format == pdfium_c.FPDFBitmap_Gray
-        assert bitmap.n_channels == 1
-        assert bitmap.width == 115
-        assert bitmap.height == 48
-        assert bitmap.stride == 116
-        assert bitmap.rev_byteorder is False
-        output_path = OutputDir / "extract.png"
-    
-    pil_image = bitmap.to_pil()
-    assert isinstance(pil_image, PIL.Image.Image)
-    pil_image.save(output_path)
-    assert output_path.exists()
-
-
-def test_remove_image():
-    
-    pdf = pdfium.PdfDocument(TestFiles.images)
-    page_1 = pdf[0]
-    
-    # TODO order images by position
-    images = list( page_1.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
-    assert len(images) == 3
-    
-    # drop an image
-    page_1.remove_obj(images[0])
-    
-    # delete and re-insert an image in place
-    page_1.remove_obj(images[1])
-    page_1.insert_obj(images[1])
-    
-    page_1.gen_content()
-    
-    output_path = OutputDir / "test_remove_objects.pdf"
-    pdf.save(output_path)
-    assert output_path.exists()
diff --git a/tests_old/test_renderer.py b/tests_old/test_renderer.py
deleted file mode 100644
index b5c79c347..000000000
--- a/tests_old/test_renderer.py
+++ /dev/null
@@ -1,381 +0,0 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
-# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
-
-import math
-import numpy
-import warnings
-import PIL.Image
-import pytest
-import pypdfium2 as pdfium
-import pypdfium2.raw as pdfium_c
-from .conftest import (
-    TestFiles,
-    PyVersion,
-    OutputDir,
-    ExpRenderPixels
-)
-
-# TODO assert that bitmap and info are consistent
-
-
-@pytest.fixture
-def sample_page():
-    pdf = pdfium.PdfDocument(TestFiles.render)
-    page = pdf[0]
-    yield page
-
-
-@pytest.fixture
-def multipage_doc():
-    pdf = pdfium.PdfDocument(TestFiles.multipage)
-    yield pdf
-
-
-def _check_pixels(pil_image, pixels):
-    for pos, value in pixels:
-        assert pil_image.getpixel(pos) == value
-
-
-@pytest.mark.parametrize(
-    ("name", "crop", "scale", "rotation"),
-    [   
-        ["01_r0",      (0,   0,   0,   0  ), 0.25, 0,   ],
-        ["02_r90",     (0,   0,   0,   0  ), 0.5,  90,  ],
-        ["03_r180",    (0,   0,   0,   0  ), 0.75, 180, ],
-        ["04_r270",    (0,   0,   0,   0  ), 1,    270, ],
-        ["05_cl",      (100, 0,   0,   0  ), 0.5,  0,   ],
-        ["06_cb",      (0,   100, 0,   0  ), 0.5,  0,   ],
-        ["07_cr",      (0,   0,   100, 0  ), 0.5,  0,   ],
-        ["08_ct",      (0,   0,   0,   100), 0.5,  0,   ],
-        ["09_r90_cb",  (0,   100,  0,  0  ), 0.5,  90,  ],
-        ["10_r180_cr", (0,   0,   100, 0  ), 0.5,  180, ],
-        ["11_r270_ct", (0,   0,   0,   100), 0.5,  270, ],
-    ]
-)
-def test_render_page_transform(sample_page, name, crop, scale, rotation):
-    pil_image = sample_page.render(
-        crop = crop,
-        scale = scale,
-        rotation = rotation,
-    ).to_pil()
-    pil_image.save(OutputDir / ("%s.png" % name))
-    assert pil_image.mode == "RGB"
-    
-    c_left, c_bottom, c_right, c_top = [math.ceil(c*scale) for c in crop]
-    w = math.ceil(595*scale)
-    h = math.ceil(842*scale)
-    if rotation in (90, 270):
-        w, h = h, w
-    
-    c_w = w - c_left - c_right
-    c_h = h - c_bottom - c_top
-    assert pil_image.size == (c_w, c_h)
-    
-    pixels = []
-    for (x, y), value in ExpRenderPixels:
-        x, y = round(x*scale), round(y*scale)
-        if rotation in (90, 270):
-            x, y = y, x
-        if rotation == 90:
-            x = w-1 - x
-        elif rotation == 180:
-            x = w-1 - x
-            y = h-1 - y
-        elif rotation == 270:
-            y = h-1 - y
-        x -= c_left
-        y -= c_top
-        if 0 <= x < c_w and 0 <= y < c_h:
-            pixels.append( ((x, y), value) )
-    
-    _check_pixels(pil_image, pixels)
-
-
-@pytest.mark.parametrize(
-    "rev_byteorder", [False, True]
-)
-def test_render_page_bgrx(rev_byteorder, sample_page):
-    pil_image = sample_page.render(
-        prefer_bgrx = True,
-        rev_byteorder = rev_byteorder,
-    ).to_pil()
-    assert pil_image.mode == "RGBX"
-    exp_pixels = [(pos, (*value, 255)) for pos, value in ExpRenderPixels]
-    _check_pixels(pil_image, exp_pixels)
-
-
-def test_render_page_alpha(sample_page):
-    
-    pixels = [
-        [(0,   0  ), (0,   0,   0,   0  )],
-        [(62,  66 ), (0,   0,   0,   186)],
-        [(150, 180), (129, 212, 26,  255)],
-        [(150, 390), (42,  96,  153, 255)],
-        [(150, 570), (128, 0,   128, 255)],
-    ]
-    kwargs = dict(
-        fill_color = (0, 0, 0, 0),
-    )
-    image = sample_page.render(**kwargs).to_pil()
-    image_rev = sample_page.render(**kwargs, rev_byteorder=True).to_pil()
-    
-    if PyVersion > (3, 6):
-        assert image == image_rev
-    assert image.mode == "RGBA"
-    assert image.size == (595, 842)
-    for pos, exp_value in pixels:
-        assert image.getpixel(pos) == exp_value
-    
-    image.save(OutputDir / "colored_alpha.png")
-
-
-def test_render_page_grey(sample_page):
-    kwargs = dict(
-        grayscale = True,
-        scale = 0.5,
-    )
-    image = sample_page.render(**kwargs).to_pil()
-    image_rev = sample_page.render(**kwargs, rev_byteorder=True).to_pil()
-    assert image == image_rev
-    assert image.size == (298, 421)
-    assert image.mode == "L"
-    image.save(OutputDir / "grayscale.png")
-
-
-@pytest.mark.parametrize(
-    "fill_color",
-    [
-        (255, 255, 255, 255),
-        (60,  70,  80,  100),
-        (255, 255, 255, 255),
-        (0,   255, 255, 255),
-        (255, 0,   255, 255),
-        (255, 255, 0,   255),
-    ]
-)
-def test_render_page_fill_color(fill_color, sample_page):
-    kwargs = dict(
-        fill_color = fill_color,
-        scale = 0.5,
-    )
-    image = sample_page.render(**kwargs).to_pil()
-    image_rev = sample_page.render(**kwargs, rev_byteorder=True).to_pil()
-    
-    if PyVersion > (3, 6):
-        assert image == image_rev
-    
-    bg_pixel = image.getpixel( (0, 0) )
-    if fill_color[3] == 255:
-        fill_color = fill_color[:-1]
-    assert image.size == (298, 421)
-    assert bg_pixel == fill_color
-
-
-@pytest.mark.parametrize("rev_byteorder", [False, True])
-def test_render_page_tonumpy(rev_byteorder, sample_page):
-    
-    bitmap = sample_page.render(
-        rev_byteorder = rev_byteorder,
-    )
-    info, array = bitmap.get_info(), bitmap.to_numpy()
-    assert isinstance(array, numpy.ndarray)
-    assert isinstance(info, pdfium.PdfBitmapInfo)
-    if rev_byteorder:
-        assert info.mode == "RGB"
-    else:
-        assert info.mode == "BGR"
-    
-    for (x, y), value in ExpRenderPixels:
-        if rev_byteorder:
-            assert tuple(array[y][x]) == value
-        else:
-            assert tuple(array[y][x]) == tuple(reversed(value))
-
-
-@pytest.mark.parametrize("mode", [None, "lcd", "print"])
-def test_render_page_optimization(sample_page, mode):
-    pil_image = sample_page.render(
-        optimize_mode = mode,
-        scale = 0.5,
-    ).to_pil()
-    assert isinstance(pil_image, PIL.Image.Image)
-
-
-def test_render_page_noantialias(sample_page):
-    pil_image = sample_page.render(
-        no_smoothtext  = True,
-        no_smoothimage = True,
-        no_smoothpath  = True,
-        scale = 0.5,
-    ).to_pil()
-    assert isinstance(pil_image, PIL.Image.Image)
-
-
-def test_render_pages_no_concurrency(multipage_doc):
-    for page in multipage_doc:
-        image = page.render(
-            scale = 0.5,
-            grayscale = True,
-        ).to_pil()
-        assert isinstance(image, PIL.Image.Image)
-
-
-@pytest.fixture
-def render_pdffile_topil(multipage_doc):
-    
-    renderer = multipage_doc.render(
-        pdfium.PdfBitmap.to_pil,
-        scale = 0.5,
-    )
-    imgs = []
-    
-    for image in renderer:
-        assert isinstance(image, PIL.Image.Image)
-        assert image.mode == "RGB"
-        imgs.append(image)
-    
-    assert len(imgs) == 3
-    yield imgs
-
-
-@pytest.fixture
-def render_pdffile_tonumpy(multipage_doc):
-    
-    renderer = multipage_doc.render(
-        pdfium.PdfBitmap.to_numpy,
-        scale = 0.5,
-        rev_byteorder = True,
-        pass_info = True,
-    )
-    imgs = []
-    
-    for array, info in renderer:
-        assert info.mode == "RGB"
-        assert isinstance(array, numpy.ndarray)
-        pil_image = PIL.Image.fromarray(array, mode=info.mode)
-        imgs.append(pil_image)
-    
-    # for i, img in enumerate(imgs):
-    #     img.save(OutputDir / ("numpy_%s.png" % i))
-    
-    assert len(imgs) == 3
-    yield imgs
-
-
-def test_render_pdffile(render_pdffile_topil, render_pdffile_tonumpy):
-    for a, b in zip(render_pdffile_topil, render_pdffile_tonumpy):
-        assert a == b
-
-
-def test_render_pdf_new():
-    
-    # two pages to actually reach the process pool and not just the single-page shortcut
-    pdf = pdfium.PdfDocument.new()
-    page_1 = pdf.new_page(50, 100)
-    page_2 = pdf.new_page(50, 100)
-    renderer = pdf.render(pdfium.PdfBitmap.to_pil)
-    bitmap_p1 = next(renderer)
-
-
-def test_render_pdfbuffer():
-    
-    buffer = open(TestFiles.multipage, "rb")
-    pdf = pdfium.PdfDocument(buffer)
-        
-    renderer = pdf.render(pdfium.PdfBitmap.to_pil)
-    bitmap_p1 = next(renderer)
-
-
-@pytest.mark.parametrize(
-    ("with_forms", "exp_color"),
-    [
-        (False, (255, 255, 255)),
-        (True, (0, 51, 113)),
-    ]
-)
-def test_render_form(with_forms, exp_color):
-    
-    pdf = pdfium.PdfDocument(TestFiles.form)
-    if with_forms:
-        pdf.init_forms()
-    
-    if with_forms:
-        assert isinstance(pdf.formenv, pdfium.PdfFormEnv)
-    else:
-        assert pdf.formenv is None
-    
-    page = pdf[0]
-    image = page.render(
-        may_draw_forms = with_forms,
-    ).to_pil()
-    
-    assert image.getpixel( (190, 190) ) == exp_color
-    assert image.getpixel( (190, 430) ) == exp_color
-    assert image.getpixel( (190, 480) ) == exp_color
-
-
-def test_numpy_nocopy(sample_page):
-    bitmap = sample_page.render(scale=0.1)
-    array = bitmap.to_numpy()
-    assert (bitmap.width, bitmap.height) == (60, 85)
-    val_a, val_b = 255, 123
-    assert array[0][0][0] == val_a
-    bitmap.buffer[0] = val_b
-    assert array[0][0][0] == val_b
-    array[0][0][0] = val_a
-    assert bitmap.buffer[0] == val_a
-
-
-@pytest.mark.parametrize(
-    ("bitmap_format", "rev_byteorder", "is_referenced"),
-    [
-        (pdfium_c.FPDFBitmap_BGR,  False, False),
-        (pdfium_c.FPDFBitmap_BGR,  True,  False),
-        (pdfium_c.FPDFBitmap_BGRA, False, False),
-        (pdfium_c.FPDFBitmap_BGRA, True,  True),
-        (pdfium_c.FPDFBitmap_BGRx, False, False),
-        (pdfium_c.FPDFBitmap_BGRx, True,  True),
-        (pdfium_c.FPDFBitmap_Gray, False, True),
-    ]
-)
-def test_pil_nocopy_where_possible(bitmap_format, rev_byteorder, is_referenced, sample_page):
-    
-    bitmap = sample_page.render(
-        scale = 0.1,
-        rev_byteorder = rev_byteorder,
-        force_bitmap_format = bitmap_format,
-    )
-    pil_image = bitmap.to_pil()
-    assert pil_image.size == (60, 85)
-    
-    val_a, val_b = 255, 123
-    if bitmap.n_channels == 4:
-        pixel_a = (val_a, 255, 255, 255)
-        pixel_b = (val_b, 255, 255, 255)
-    elif bitmap.n_channels == 3:
-        pixel_a = (val_a, 255, 255)
-        pixel_b = (val_b, 255, 255)
-    elif bitmap.n_channels == 1:
-        pixel_a = val_a
-        pixel_b = val_b
-    else:
-        assert False
-    
-    assert pil_image.getpixel((0, 0)) == pixel_a
-    bitmap.buffer[0] = val_b
-    
-    if is_referenced:
-        
-        # changes to the buffer are reflected in the image
-        assert pil_image.getpixel((0, 0)) == pixel_b
-        
-        # changes to the image are reflected in the buffer, since we set `.readonly = False` on after image init
-        pil_image.putpixel((0, 0), pixel_a)
-        assert pil_image.getpixel((0, 0)) == pixel_a
-        assert bitmap.buffer[0] == val_a
-        
-    else:
-        if pil_image.getpixel((0, 0)) == pixel_b:
-            warnings.warn(f"PIL now references {bitmap.mode} mode bitmaps.")
-        else:
-            assert pil_image.getpixel((0, 0)) == pixel_a
diff --git a/tests_old/test_saver.py b/tests_old/test_saver.py
deleted file mode 100644
index 331aa7918..000000000
--- a/tests_old/test_saver.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
-# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
-
-import io
-import pypdfium2 as pdfium
-import pypdfium2.raw as pdfium_c
-from .conftest import TestFiles, OutputDir
-
-
-def test_save():
-    
-    src_pdf = pdfium.PdfDocument(TestFiles.multipage)
-    new_pdf_raw = pdfium_c.FPDF_ImportNPagesToOne(
-        src_pdf.raw,
-        595, 842,
-        2, 2,
-    )
-    
-    new_pdf = pdfium.PdfDocument(new_pdf_raw)
-    assert len(new_pdf) == 1
-    page = new_pdf[0]
-    assert page.get_size() == (595, 842)
-    
-    output_file = OutputDir / "tiling.pdf"
-    new_pdf.save(output_file)
-    assert output_file.exists()
-    
-
-def test_save_withversion():
-    
-    pdf = pdfium.PdfDocument(TestFiles.multipage)
-    pre_id_p = pdf.get_identifier(pdfium_c.FILEIDTYPE_PERMANENT)
-    pre_id_c = pdf.get_identifier(pdfium_c.FILEIDTYPE_CHANGING)
-    assert isinstance(pre_id_p, bytes)
-    pdf.del_page(1)
-    
-    buffer = io.BytesIO()
-    pdf.save(buffer, version=17)
-    
-    buffer.seek(0)
-    data = buffer.read()
-    buffer.seek(0)
-    
-    exp_start = b"%PDF-1.7"
-    exp_end = b"%EOF\r\n"
-    assert data[:len(exp_start)] == exp_start
-    assert data[-len(exp_end):] == exp_end
-    
-    reopened_pdf = pdfium.PdfDocument(buffer, autoclose=True)
-    assert len(reopened_pdf) == 2
-    
-    post_id_p = reopened_pdf.get_identifier(pdfium_c.FILEIDTYPE_PERMANENT)
-    post_id_c = reopened_pdf.get_identifier(pdfium_c.FILEIDTYPE_CHANGING)
-    assert pre_id_p == post_id_p
-    assert pre_id_c != post_id_c
diff --git a/tests_old/test_text.py b/tests_old/test_text.py
deleted file mode 100644
index cb9bf086b..000000000
--- a/tests_old/test_text.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
-# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
-
-import re
-import pytest
-import pypdfium2 as pdfium
-from .conftest import TestFiles
-
-
-@pytest.fixture
-def text_pdf():
-    pdf = pdfium.PdfDocument(TestFiles.text)
-    yield pdf
-
-
-@pytest.fixture
-def textpage(text_pdf):
-    page = text_pdf[0]
-    textpage = page.get_textpage()
-    assert isinstance(textpage, pdfium.PdfTextPage)
-    yield textpage
-
-
-@pytest.fixture
-def linkpage(text_pdf):
-    page = text_pdf[1]
-    linkpage = page.get_textpage()
-    yield linkpage
-
-
-def test_gettext(textpage):
-    text_a = textpage.get_text_bounded()
-    text_b = textpage.get_text_range()
-    assert text_a == text_b
-    assert len(text_a) == 438
-    exp_start = "Lorem ipsum dolor sit amet,\r\n"
-    exp_end = "\r\nofficia deserunt mollit anim id est laborum."
-    assert text_a.startswith(exp_start)
-    assert text_a.endswith(exp_end)
-    text_start = textpage.get_text_range(0, len(exp_start))
-    text_end_a = textpage.get_text_range(len(text_a)-len(exp_end))  # count=-1
-    text_end_b = textpage.get_text_range(len(text_a)-len(exp_end), len(exp_end))
-    assert text_start == exp_start
-    assert text_end_a == text_end_b == exp_end
-
-
-@pytest.mark.parametrize("loose", [False, True])
-def test_getcharbox(textpage, loose):
-    for index in range(textpage.count_chars()):
-        box = textpage.get_charbox(index, loose=loose)
-        assert all( isinstance(val, (int, float)) for val in box )
-        assert box[0] <= box[2] and box[1] <= box[3]
-
-
-def test_getrectboxes(textpage):
-    n_rects = textpage.count_rects()
-    rects = [textpage.get_rect(i) for i in range(n_rects)]
-    assert len(rects) == 10
-    
-    first_rect = rects[0]
-    assert pytest.approx(first_rect, abs=1) == (58, 767, 258, 782)
-    first_text = textpage.get_text_bounded(*first_rect)
-    assert first_text == "Lorem ipsum dolor sit amet,"
-    assert textpage.get_text_range(0, len(first_text)) == first_text
-    
-    for rect in rects:
-        assert len(rect) == 4
-        assert 56 < rect[0] < 59
-        text = textpage.get_text_bounded(*rect)
-        assert isinstance(text, str)
-        assert len(text) <= 66
-    
-    assert text == "officia deserunt mollit anim id est laborum."
-    assert textpage.get_text_range(textpage.count_chars()-len(text))  # count=-1
-
-
-def _get_rects(textpage, search_result):
-    # TODO add helper?
-    if search_result is None:
-        return []
-    c_index, c_count = search_result
-    r_index = textpage.count_rects(0, c_index) - 1
-    r_count = textpage.count_rects(c_index, c_count)
-    textpage.count_rects()
-    rects = [textpage.get_rect(i) for i in range(r_index, r_index+r_count)]
-    return rects
-
-
-def test_search_text(textpage):
-    searcher = textpage.search("labor")
-    
-    occ_1a = searcher.get_next()
-    occ_2a = searcher.get_next()
-    occ_3a = searcher.get_next()
-    occ_4x = searcher.get_next()
-    occ_2b = searcher.get_prev()
-    occ_1b = searcher.get_prev()
-    
-    assert occ_1a == (89, 5)
-    assert occ_2a == (181, 5)
-    assert occ_3a == (430, 5)
-    assert occ_4x is None
-    assert occ_1a == occ_1b and occ_2a == occ_2b
-    
-    occs = (occ_1a, occ_2a, occ_3a)
-    exp_rectlists = [
-        [ (57, 675, 511, 690) ],
-        [ (58, 638, 537, 653) ],
-        [ (58, 549, 367, 561) ],
-    ]
-    
-    for occ, exp_rects in zip(occs, exp_rectlists):
-        rects = _get_rects(textpage, occ)
-        assert [pytest.approx(r, abs=0.5) for r in rects] == exp_rects
-
-
-def test_get_index(textpage):
-    
-    x, y = (60, textpage.page.get_height()-66)
-    
-    index = textpage.get_index(x, y, 5, 5)
-    assert index < textpage.count_chars() and index == 0
-    
-    charbox = textpage.get_charbox(index)
-    char = textpage.get_text_bounded(*charbox)
-    assert char == "L"
-
-
-def test_textpage_empty():
-    pdf = pdfium.PdfDocument(TestFiles.empty)
-    page = pdf[0]
-    textpage = page.get_textpage()
-    
-    assert textpage.get_text_bounded() == ""
-    assert textpage.get_text_range() == ""
-    assert textpage.count_chars() == 0
-    assert textpage.count_rects() == 0
-    assert textpage.get_index(0, 0, 0, 0) is None
-    
-    searcher = textpage.search("a")
-    assert searcher.get_next() is None
-    
-    with pytest.raises(pdfium.PdfiumError, match=re.escape("Failed to get charbox.")):
-        textpage.get_charbox(0)
-    with pytest.raises(ValueError, match=re.escape("Text length must be greater than 0.")):
-        textpage.search("")

From 06d45c18190d292d78dfe7f570922d1cb97691eb Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 8 Apr 2024 22:07:31 +0200
Subject: [PATCH 028/140] Add a somewhat elaborate posconv embeddertest

---
 src/pypdfium2/_helpers/page.py |  4 ++--
 tests/conftest.py              | 24 +++++++++++++++---------
 tests/test_pageobjects.py      |  8 +-------
 tests/test_rendering.py        | 29 ++++++++++++++++++++++++++++-
 4 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 1dc764622..e4b3f6598 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = ("PdfPage", )
+__all__ = ("PdfPage", "PdfPosConv")
 
 import math
 import ctypes
@@ -499,7 +499,7 @@ def _parse_renderopts(
     return cl_format, rev_byteorder, fill_color, flags
 
 
-class PdfPosConv:  # TODO add to test suite
+class PdfPosConv:
     """
     Pdf coordinate translator.
     
diff --git a/tests/conftest.py b/tests/conftest.py
index 0c7559351..6dbbffd43 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 import sys
+import pytest
 from pathlib import Path
 from argparse import Namespace
 import pypdfium2.__main__ as pdfium_cli
@@ -29,19 +30,10 @@ def _gather_resources(dir, skip_exts=[".in"]):
         setattr(test_files, path.stem, (dir / path.name))
     return test_files
 
-
 TestFiles = _gather_resources(ResourceDir)
 TestExpectations = _gather_resources(ExpectationsDir)
 
 
-ExpRenderPixels = (
-    ( (0,   0  ), (255, 255, 255) ),
-    ( (150, 180), (129, 212, 26 ) ),
-    ( (150, 390), (42,  96,  153) ),
-    ( (150, 570), (128, 0,   128) ),
-)
-
-
 def get_members(cls):
     members = []
     for attr in dir(cls):
@@ -51,6 +43,20 @@ def get_members(cls):
     return members
 
 
+def compare_n2(data, exp_data, approx_abs=1):
+    assert len(data) == len(exp_data)
+    for d, exp_d in zip(data, exp_data):
+        assert pytest.approx(d, abs=approx_abs) == exp_d
+
+
+ExpRenderPixels = (
+    ( (0,   0  ), (255, 255, 255) ),
+    ( (150, 180), (129, 212, 26 ) ),
+    ( (150, 390), (42,  96,  153) ),
+    ( (150, 570), (128, 0,   128) ),
+)
+
+
 # def iterate_testfiles(skip_encrypted=True):
 #     encrypted = (TestFiles.encrypted, )
 #     for attr_name in dir(TestFiles):
diff --git a/tests/test_pageobjects.py b/tests/test_pageobjects.py
index b7e8451f3..b6902238b 100644
--- a/tests/test_pageobjects.py
+++ b/tests/test_pageobjects.py
@@ -7,13 +7,7 @@
 import PIL.Image
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
-from .conftest import TestFiles, OutputDir
-
-
-def compare_n2(data, exp_data):
-    assert len(data) == len(exp_data)
-    for d, exp_d in zip(data, exp_data):
-        assert pytest.approx(d, abs=1) == exp_d
+from .conftest import TestFiles, OutputDir, compare_n2
 
 
 def test_image_objects():
diff --git a/tests/test_rendering.py b/tests/test_rendering.py
index f8d75508c..d20eca341 100644
--- a/tests/test_rendering.py
+++ b/tests/test_rendering.py
@@ -5,6 +5,7 @@
 import numpy
 import warnings
 import PIL.Image
+import PIL.ImageDraw
 import pytest
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
@@ -12,7 +13,8 @@
     TestFiles,
     PyVersion,
     OutputDir,
-    ExpRenderPixels
+    ExpRenderPixels,
+    compare_n2,
 )
 
 # TODO assert that bitmap and info are consistent
@@ -379,3 +381,28 @@ def test_pil_nocopy_where_possible(bitmap_format, rev_byteorder, is_referenced,
             warnings.warn(f"PIL now references {bitmap.mode} mode bitmaps.")
         else:
             assert pil_image.getpixel((0, 0)) == pixel_a
+
+
+def test_draw_image_borders():
+    # this demonstrates posconv functionality
+    
+    pdf = pdfium.PdfDocument(TestFiles.images)
+    page = pdf[0]
+    images = list( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]) )
+    pdf_qpl = [i.get_quad_points() for i in images]
+    
+    bitmap = page.render(scale=1)
+    posconv = pdfium.PdfPosConv(page, bitmap)
+    pil_image = bitmap.to_pil()
+    bitmap_qpl  = [[posconv.to_bitmap(x, y) for x, y in qps] for qps in pdf_qpl]
+    
+    reverse_qpl = [[posconv.to_page(x, y) for x, y in qps] for qps in bitmap_qpl]
+    for qps_a, qps_b in zip(pdf_qpl, reverse_qpl):
+        compare_n2(qps_a, qps_b)
+    
+    draw = PIL.ImageDraw.Draw(pil_image)
+    GREEN = (50, 200, 10)
+    for qps in bitmap_qpl:
+        draw.polygon(qps, outline=GREEN, width=3)
+    
+    pil_image.save(OutputDir/"image_borders.png")

From 51ad9d078ca2f8130c838dcea46bae2de4eae8ee Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 8 Apr 2024 22:21:23 +0200
Subject: [PATCH 029/140] make posconv api more flexible

---
 src/pypdfium2/_helpers/page.py | 39 +++++++++++++++-------------------
 tests/test_page.py             | 19 +++++++----------
 tests/test_rendering.py        |  2 +-
 3 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index e4b3f6598..4e7ea6112 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -300,7 +300,7 @@ def get_objects(self, filter=None, max_depth=2, form=None, level=0):
                 )
     
     
-    # non-public because it doesn't really work (returns success but does nothing on all samples we tried)
+    # non-public because it doesn't seem to work (returns success but does nothing on the samples we tried)
     def _flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY):
         """
         Attempt to flatten annotations and form fields into the page contents.
@@ -316,12 +316,21 @@ def _flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY):
         return rc
     
     
+    def get_posconv(self, bitmap):
+        """
+        Acquire a :class:`.PdfPosConv` coordinate translator for a :class:`PdfBitmap` rendered from this page.
+        """
+        # if the bitmap was rendered from a page, resolve weakref and check identity
+        if not bitmap._pos_args or bitmap._pos_args[0]() is not self:
+            raise RuntimeError("The given bitmap does not belong to this page.")
+        return PdfPosConv(self, bitmap._pos_args[1:])
+    
+    
     # TODO
     # - add helpers for matrix-based and interruptible rendering
     # - add lower-level renderer that takes a caller-provided bitmap
     # e. g. render(), render_ex(), render_matrix(), render_matrix_ex()
     
-    
     def render(
             self,
             scale = 1,
@@ -329,7 +338,6 @@ def render(
             crop = (0, 0, 0, 0),
             may_draw_forms = True,
             bitmap_maker = PdfBitmap.new_native,
-            fill_to_stroke = False,
             **kwargs
         ):
         """
@@ -506,43 +514,30 @@ class PdfPosConv:
     Parameters:
         page (PdfPage):
             Handle to the page.
-        bitmap (PdfBitmap):
-            Handle to the bitmap, which must be a rendering of *page*.
+        pos_args (tuple[int*5]):
+            pdfium canvas args (start_x, start_y, size_x, size_y, rotate), as in ``FPDF_RenderPageBitmap()`` etc.
     """
     
-    # NOTE The reason for this API design is that neither page nor bitmap should hold a permanent reference to another, so they can be freed independently via finalizer. Obviously, a weak reference alone is not sufficient, as its object can disappear. So we need an explicit takeover of the page, ensuring it is held in memory.
-    
-    def __init__(self, page, bitmap):
-        
-        if not bitmap._pos_args:
-            raise RuntimeError("This bitmap does not belong to a page.")
-        
-        assert page != None
-        page_ref = bitmap._pos_args[0]
-        if page_ref() is not page:  # resolve weakref and check identity
-            raise RuntimeError("This bitmap was not rendered from the given page.")
-        
+    def __init__(self, page, pos_args):
         self.page = page
-        self._args = bitmap._pos_args[1:]
-    
+        self.pos_args = pos_args
     
     def to_page(self, bitmap_x, bitmap_y):
         """
         Translate coordinates from bitmap to page.
         """
         page_x, page_y = ctypes.c_double(), ctypes.c_double()
-        ok = pdfium_c.FPDF_DeviceToPage(self.page, *self._args, bitmap_x, bitmap_y, page_x, page_y)
+        ok = pdfium_c.FPDF_DeviceToPage(self.page, *self.pos_args, bitmap_x, bitmap_y, page_x, page_y)
         if not ok:
             raise PdfiumError("Failed to translate to page coordinates.")
         return (page_x.value, page_y.value)
     
-    
     def to_bitmap(self, page_x, page_y):
         """
         Translate coordinates from page to bitmap.
         """
         bitmap_x, bitmap_y = ctypes.c_int(), ctypes.c_int()
-        ok = pdfium_c.FPDF_PageToDevice(self.page, *self._args, page_x, page_y, bitmap_x, bitmap_y)
+        ok = pdfium_c.FPDF_PageToDevice(self.page, *self.pos_args, page_x, page_y, bitmap_x, bitmap_y)
         if not ok:
             raise PdfiumError("Failed to translate to bitmap coordinates.")
         return (bitmap_x.value, bitmap_y.value)
diff --git a/tests/test_page.py b/tests/test_page.py
index 9bde27c0a..4f5956d59 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -3,7 +3,7 @@
 
 import pytest
 import pypdfium2 as pdfium
-# import pypdfium2.raw as pdfium_c
+import pypdfium2.raw as pdfium_c
 from .conftest import TestFiles
 
 
@@ -52,13 +52,10 @@ def test_page_labels():
     assert exp_labels == [pdf.get_page_label(i) for i in range(len(pdf))]
 
 
-# # disabled because flattening takes no effect
-# def test_flatten():
-    
-#     pdf = pdfium.PdfDocument(TestFiles.form)
-#     page = pdf[0]
-    
-#     rc = page._flatten()
-#     assert rc == pdfium_c.FLATTEN_SUCCESS
-    
-#     # pdf.save(OutputDir / "flattened.pdf")
+# seems to take no effect; probably a pdfium bug
+def test_flatten():
+    pdf = pdfium.PdfDocument(TestFiles.forms)
+    page = pdf[0]
+    rc = page._flatten()
+    assert rc == pdfium_c.FLATTEN_SUCCESS
+    # pdf.save(OutputDir / "flattened.pdf")
diff --git a/tests/test_rendering.py b/tests/test_rendering.py
index d20eca341..88f3e6a4b 100644
--- a/tests/test_rendering.py
+++ b/tests/test_rendering.py
@@ -392,7 +392,7 @@ def test_draw_image_borders():
     pdf_qpl = [i.get_quad_points() for i in images]
     
     bitmap = page.render(scale=1)
-    posconv = pdfium.PdfPosConv(page, bitmap)
+    posconv = page.get_posconv(bitmap)
     pil_image = bitmap.to_pil()
     bitmap_qpl  = [[posconv.to_bitmap(x, y) for x, y in qps] for qps in pdf_qpl]
     

From 702b1984399d8d61896cdf63f7abc8a76b28bf7b Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 9 Apr 2024 14:44:02 +0200
Subject: [PATCH 030/140] update changelog

---
 docs/devel/changelog_staging.md    | 6 +++++-
 src/pypdfium2/_helpers/textpage.py | 6 ++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 152aa8582..79325a627 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -19,10 +19,14 @@
 - Removed legacy version flags.
 
 *Improvements and new features*
+- Added `PdfPosConv` helper for bidirectional translation between page and bitmap coordinates.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
-- Simplified version implementation (no API change expected). Replaced `__getattr__` magic with assignments, so all attributes now show up in `dir()`.
+- Simplified version impl (no API change expected).
+
+*Project*
+- Merged `tests_old/` back into `tests/`.
 
 <!-- TODO
 See https://github.com/pypdfium2-team/pypdfium2/blob/devel_old/docs/devel/changelog_staging.md
diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
index b1303fecd..82ab1eb63 100644
--- a/src/pypdfium2/_helpers/textpage.py
+++ b/src/pypdfium2/_helpers/textpage.py
@@ -293,15 +293,13 @@ def _get_occurrence(self, find_func):
     def get_next(self):
         """
         Returns:
-            (int, int): Start character index and count of the next occurrence,
-            or None if the last occurrence was passed.
+            (int, int) | None: Start character index and count of the next occurrence, or None if the last occurrence was passed.
         """
         return self._get_occurrence(pdfium_c.FPDFText_FindNext)
     
     def get_prev(self):
         """
         Returns:
-            (int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
-            or None if the last occurrence was passed.
+            (int, int) | None: Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence), or None if the last occurrence was passed.
         """
         return self._get_occurrence(pdfium_c.FPDFText_FindPrev)

From bb75dac486da1f0fb3a76c8384d92595ace31054 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 9 Apr 2024 15:05:04 +0200
Subject: [PATCH 031/140] Actually get flattening to work (it was just a usage
 mistake)

Turns out flattening requires forms init and FORM_OnAfterLoadPage().
Probably the latter call was added later than _flatten().

Enforce the requirement by checking self.formenv.
---
 docs/devel/changelog_staging.md    |  1 +
 src/pypdfium2/_helpers/document.py |  2 +-
 src/pypdfium2/_helpers/page.py     | 14 ++++++++++----
 tests/test_page.py                 | 14 +++++++-------
 4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 79325a627..8a32b02ab 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -21,6 +21,7 @@
 *Improvements and new features*
 - Added `PdfPosConv` helper for bidirectional translation between page and bitmap coordinates.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
+- Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added an assertion to make sure requirements are met, and updated docs accordingly.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
 - Simplified version impl (no API change expected).
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index b9c6ccdc3..1804f96c3 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -363,7 +363,7 @@ def get_page(self, index):
             PdfPage: The page at *index* (zero-based).
         Note:
             This calls ``FORM_OnAfterLoadPage()`` if the document has an active form env.
-            The form env must not be closed before the page is closed!
+            Note that closing the formenv would implicitly close the page.
         """
         
         raw_page = pdfium_c.FPDF_LoadPage(self, index)
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 4e7ea6112..daa2f0ed4 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -25,6 +25,7 @@ class PdfPage (pdfium_i.AutoCloseable):
     Attributes:
         raw (FPDF_PAGE): The underlying PDFium page handle.
         pdf (PdfDocument): Reference to the document this page belongs to.
+        formenv (PdfFormEnv|None): Formenv handle, if the parent pdf had an active formenv at the time of page retrieval. None otherwise.
     """
     
     def __init__(self, raw, pdf, formenv):
@@ -300,16 +301,21 @@ def get_objects(self, filter=None, max_depth=2, form=None, level=0):
                 )
     
     
-    # non-public because it doesn't seem to work (returns success but does nothing on the samples we tried)
-    def _flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY):
+    def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY):
         """
-        Attempt to flatten annotations and form fields into the page contents.
+        Flatten form fields and annotations into page contents.
+        
+        Attention:
+            :meth:`~.PdfDocument.init_forms` must have been called on the parent pdf, before the page was retrieved, for this method to work.
+            In other words, :attr:`.PdfPage.formenv` must be non-null.
         
         Parameters:
             flag (int): PDFium flattening target (:attr:`FLAT_*`)
         Returns:
             int: PDFium flattening status (:attr:`FLATTEN_*`). :attr:`FLATTEN_FAIL` is handled internally.
         """
+        if not self.formenv:
+            raise RuntimeError("page.flatten() requires previous pdf.init_forms() before page retrieval.")
         rc = pdfium_c.FPDFPage_Flatten(self, flag)
         if rc == pdfium_c.FLATTEN_FAIL:
             raise PdfiumError("Failed to flatten annotations / form fields.")
@@ -318,7 +324,7 @@ def _flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY):
     
     def get_posconv(self, bitmap):
         """
-        Acquire a :class:`.PdfPosConv` coordinate translator for a :class:`PdfBitmap` rendered from this page.
+        Acquire a :class:`.PdfPosConv` coordinate translator for a :class:`.PdfBitmap` rendered from this page.
         """
         # if the bitmap was rendered from a page, resolve weakref and check identity
         if not bitmap._pos_args or bitmap._pos_args[0]() is not self:
diff --git a/tests/test_page.py b/tests/test_page.py
index 4f5956d59..8fcdaa51f 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -4,7 +4,7 @@
 import pytest
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
-from .conftest import TestFiles
+from .conftest import TestFiles, OutputDir
 
 
 def test_boxes():
@@ -25,9 +25,9 @@ def test_boxes():
         ("art",   (40, 40, 555, 802)),
     ]
     
-    for meth_name, exp_box in test_cases:
-        getattr(page, "set_%sbox" % meth_name)(*exp_box)
-        box = getattr(page, "get_%sbox" % meth_name)()
+    for mn, exp_box in test_cases:
+        getattr(page, f"set_{mn}box")(*exp_box)
+        box = getattr(page, f"get_{mn}box")()
         assert pytest.approx(box) == exp_box
 
 
@@ -52,10 +52,10 @@ def test_page_labels():
     assert exp_labels == [pdf.get_page_label(i) for i in range(len(pdf))]
 
 
-# seems to take no effect; probably a pdfium bug
 def test_flatten():
     pdf = pdfium.PdfDocument(TestFiles.forms)
+    pdf.init_forms()
     page = pdf[0]
-    rc = page._flatten()
+    rc = page.flatten()
     assert rc == pdfium_c.FLATTEN_SUCCESS
-    # pdf.save(OutputDir / "flattened.pdf")
+    pdf.save(OutputDir / "flattened.pdf")

From 4bbea67d3efd93a81867700ed7960dfd6ac7c8ca Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 9 Apr 2024 15:17:50 +0200
Subject: [PATCH 032/140] slightly improve docs

---
 src/pypdfium2/_helpers/bitmap.py      | 6 ++++--
 src/pypdfium2/_helpers/document.py    | 4 ++--
 src/pypdfium2/_helpers/page.py        | 4 ++++
 src/pypdfium2/_helpers/pageobjects.py | 8 ++++----
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index 544ebff70..cb0568eed 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -117,11 +117,13 @@ def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None):
     @classmethod
     def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, stride=None):
         """
-        Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by Python/ctypes.
-        Bitmaps created by this function are always packed (no unused bytes at line end).
+        Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by Python/ctypes, or provided by the caller.
+        Buffers allocated by this function are packed (i.e. no unused bytes at line end).
+        If an external buffer is provided, stride may be set if there is a padding.
         """
         
         if stride is None:
+            assert buffer != None
             stride = width * pdfium_i.BitmapTypeToNChannels[format]
         if buffer is None:
             buffer = (ctypes.c_ubyte * (stride * height))()
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 1804f96c3..136abc658 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -363,7 +363,7 @@ def get_page(self, index):
             PdfPage: The page at *index* (zero-based).
         Note:
             This calls ``FORM_OnAfterLoadPage()`` if the document has an active form env.
-            Note that closing the formenv would implicitly close the page.
+            In that case, note that closing the formenv would implicitly close the page.
         """
         
         raw_page = pdfium_c.FPDF_LoadPage(self, index)
@@ -655,7 +655,7 @@ def get_count(self):
     def get_dest(self):
         """
         Returns:
-            PdfDest | None: The bookmark's destination (page index, viewport), or None on failure.
+            PdfDest | None: The bookmark's destination (an object providing page index and viewport), or None on failure.
         """
         raw_dest = pdfium_c.FPDFBookmark_GetDest(self.pdf, self)
         if not raw_dest:
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index daa2f0ed4..614131a82 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -517,6 +517,10 @@ class PdfPosConv:
     """
     Pdf coordinate translator.
     
+    Hint:
+        Use :meth:`.PdfPage.get_posconv` to obtain an instance of this class.
+        It is not normally necessary to access the :class:`.PdfPosConv` constructor directly.
+    
     Parameters:
         page (PdfPage):
             Handle to the page.
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index b5a46e73c..ac1e6e784 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -341,15 +341,15 @@ def extract(self, dest, *args, **kwargs):
         """
         Extract the image into an independently usable file or byte buffer, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits.
         
-        Only DCTDecode (JPEG) and JPXDecode (JPEG 2000) images can be extracted directly.
+        This method can only extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly.
         Otherwise, the pixel data is decoded, and re-encoded using :mod:`PIL`.
-        For images with simple filters only, ``get_data(decode_simple=True)`` is used for decoding to preserve higher bit depth or special color formats not supported by FPDF_BITMAP.
+        For images with simple filters only, ``get_data(decode_simple=True)`` is used for decoding to preserve higher bit depth or special color formats not supported by ``FPDF_BITMAP``.
         For images with complex filters, we have to resort to :meth:`.get_bitmap`, which can be a lossy operation.
         
-        Note, this method ignores alpha masks and some other data stored separately from the main data stream (e.g. BlackIsWhite), which might lead to incorrect representation of the image.
+        Note, this method ignores alpha masks, and potentially other data stored separately of the main data stream, which might lead to incorrect representation of the image.
         
         Parameters:
-            dest (str | io.BytesIO):
+            dest (str | pathlib.Path | io.BytesIO):
                 File prefix or byte buffer to which the image shall be written.
             fb_format (str):
                 The image format to use in case it is necessary to (re-)encode the data.

From 29776b7fbe4ea161fc05cc31d52749d52e4455fd Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 9 Apr 2024 15:58:13 +0200
Subject: [PATCH 033/140] bitmap: fix assertion blunder

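The assertion was added recently on the devel branch, but in the wrong
place: it demanded a caller-provided buffer whenever stride was unset,
which breaks the default case where new_native() allocates the buffer
itself. Assert instead that no explicit stride was passed when the
buffer is allocated internally.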
---
 src/pypdfium2/_helpers/bitmap.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index cb0568eed..1e7a9b1c6 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -122,10 +122,11 @@ def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, str
         If an external buffer is provided, stride may be set if there is a padding.
         """
         
+        orig_stride = stride
         if stride is None:
-            assert buffer != None
             stride = width * pdfium_i.BitmapTypeToNChannels[format]
         if buffer is None:
+            assert orig_stride is None
             buffer = (ctypes.c_ubyte * (stride * height))()
         raw = pdfium_c.FPDFBitmap_CreateEx(width, height, format, buffer, stride)
         

From 238e425a8ef518752df5c9100cc412372c164ba1 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 9 Apr 2024 15:59:42 +0200
Subject: [PATCH 034/140] Add PdfPosConv unittest

---
 tests/test_page.py      | 19 +++++++++++++++++++
 tests/test_rendering.py |  3 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tests/test_page.py b/tests/test_page.py
index 8fcdaa51f..748a745c9 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -59,3 +59,22 @@ def test_flatten():
     rc = page.flatten()
     assert rc == pdfium_c.FLATTEN_SUCCESS
     pdf.save(OutputDir / "flattened.pdf")
+
+
+def test_posconv():
+    pdf = pdfium.PdfDocument.new()
+    W, H = 100, 150
+    page = pdf.new_page(W, H)
+    posconv = pdfium.PdfPosConv(page, (0, 0, W, H, 0))
+    page_corners = [
+        (0, 0),  # bottom left == origin
+        (W, 0),  # bottom right
+        (0, H),  # top left
+        (W, H),  # top right
+    ]
+    # bitmaps use top-left as origin
+    bmp_corners = [posconv.to_bitmap(x, y) for x, y in page_corners]
+    exp_bmp_corners = [(x, H-y) for x, y in page_corners]
+    assert bmp_corners == exp_bmp_corners
+    reverse_page_corners = [posconv.to_page(x, y) for x, y in bmp_corners]
+    assert reverse_page_corners == page_corners
diff --git a/tests/test_rendering.py b/tests/test_rendering.py
index 88f3e6a4b..d757d0b9a 100644
--- a/tests/test_rendering.py
+++ b/tests/test_rendering.py
@@ -384,7 +384,8 @@ def test_pil_nocopy_where_possible(bitmap_format, rev_byteorder, is_referenced,
 
 
 def test_draw_image_borders():
-    # this demonstrates posconv functionality
+    # this demonstrates posconv functionality in the form of an embedder test
+    # see test_page::test_posconv for a more unittest-like example
     
     pdf = pdfium.PdfDocument(TestFiles.images)
     page = pdf[0]

From c4756b83324aaaba7d6937918036ce326b741892 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 9 Apr 2024 16:03:29 +0200
Subject: [PATCH 035/140] update changelog & add task

---
 docs/devel/changelog_staging.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 8a32b02ab..b9980f85c 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -19,7 +19,8 @@
 - Removed legacy version flags.
 
 *Improvements and new features*
-- Added `PdfPosConv` helper for bidirectional translation between page and bitmap coordinates.
+<!-- TODO change to PdfBitmap.get_posconv(page) ? because we can't use it unless a bitmap has been written from the page -->
+- Added `PdfPosConv` helper and `PdfPage.get_posconv(bitmap)` for bidirectional translation between page and bitmap coordinates.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
 - Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added an assertion to make sure requirements are met, and updated docs accordingly.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.

From cd064b94e85774678c6b00e1bbe6cf223d319a06 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 9 Apr 2024 16:57:57 +0200
Subject: [PATCH 036/140] Move get_posconv() to bitmap

---
 docs/devel/changelog_staging.md  |  3 +-
 src/pypdfium2/_helpers/bitmap.py | 53 ++++++++++++++++++++++++++++++--
 src/pypdfium2/_helpers/page.py   | 52 +------------------------------
 tests/test_rendering.py          |  2 +-
 4 files changed, 54 insertions(+), 56 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index b9980f85c..909d28b77 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -19,8 +19,7 @@
 - Removed legacy version flags.
 
 *Improvements and new features*
-<!-- TODO change to PdfBitmap.get_posconv(page) ? because we can't use it unless a bitmap has been written from the page -->
-- Added `PdfPosConv` helper and `PdfPage.get_posconv(bitmap)` for bidirectional translation between page and bitmap coordinates.
+- Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
 - Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added an assertion to make sure requirements are met, and updated docs accordingly.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index 1e7a9b1c6..f98fe76dd 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = ("PdfBitmap", )
+__all__ = ("PdfBitmap", "PdfPosConv")
 
 import ctypes
 import logging
@@ -79,6 +79,7 @@ def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, ne
     def parent(self):  # AutoCloseable hook
         return None
     
+    
     @classmethod
     def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None):
         """
@@ -275,7 +276,16 @@ def from_pil(cls, pil_image):
         return cls.new_native(w, h, format, rev_byteorder=False, buffer=pil_image.tobytes())
     
     
-    # TODO implement from_numpy()
+    def get_posconv(self, page):
+        """
+        Acquire a :class:`.PdfPosConv` coordinate translator for this bitmap and the page it was rendered from.
+        
+        This API requires passing in the page explicitly, to avoid holding a strong reference, so that bitmap and page can be freed by finalizer independently.
+        """
+        # if the bitmap was rendered from a page, resolve weakref and check identity
+        if not self._pos_args or self._pos_args[0]() is not page:
+            raise RuntimeError("This bitmap does not belong to the given page.")
+        return PdfPosConv(page, self._pos_args[1:])
 
 
 def _pil_convert_for_pdfium(pil_image):
@@ -302,3 +312,42 @@ def _pil_convert_for_pdfium(pil_image):
         pil_image = PIL.Image.merge("RGBX", (b, g, r, x))
     
     return pil_image
+
+
+class PdfPosConv:
+    """
+    Pdf coordinate translator.
+    
+    Hint:
+        You may want to use :meth:`.PdfBitmap.get_posconv` to obtain an instance of this class.
+    
+    Parameters:
+        page (PdfPage):
+            Handle to the page.
+        pos_args (tuple[int*5]):
+            pdfium canvas args (start_x, start_y, size_x, size_y, rotate), as in ``FPDF_RenderPageBitmap()`` etc.
+    """
+    
+    def __init__(self, page, pos_args):
+        self.page = page
+        self.pos_args = pos_args
+    
+    def to_page(self, bitmap_x, bitmap_y):
+        """
+        Translate coordinates from bitmap to page.
+        """
+        page_x, page_y = ctypes.c_double(), ctypes.c_double()
+        ok = pdfium_c.FPDF_DeviceToPage(self.page, *self.pos_args, bitmap_x, bitmap_y, page_x, page_y)
+        if not ok:
+            raise PdfiumError("Failed to translate to page coordinates.")
+        return (page_x.value, page_y.value)
+    
+    def to_bitmap(self, page_x, page_y):
+        """
+        Translate coordinates from page to bitmap.
+        """
+        bitmap_x, bitmap_y = ctypes.c_int(), ctypes.c_int()
+        ok = pdfium_c.FPDF_PageToDevice(self.page, *self.pos_args, page_x, page_y, bitmap_x, bitmap_y)
+        if not ok:
+            raise PdfiumError("Failed to translate to bitmap coordinates.")
+        return (bitmap_x.value, bitmap_y.value)
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 614131a82..14b72a76a 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = ("PdfPage", "PdfPosConv")
+__all__ = ("PdfPage", )
 
 import math
 import ctypes
@@ -322,16 +322,6 @@ def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY):
         return rc
     
     
-    def get_posconv(self, bitmap):
-        """
-        Acquire a :class:`.PdfPosConv` coordinate translator for a :class:`.PdfBitmap` rendered from this page.
-        """
-        # if the bitmap was rendered from a page, resolve weakref and check identity
-        if not bitmap._pos_args or bitmap._pos_args[0]() is not self:
-            raise RuntimeError("The given bitmap does not belong to this page.")
-        return PdfPosConv(self, bitmap._pos_args[1:])
-    
-    
     # TODO
     # - add helpers for matrix-based and interruptible rendering
     # - add lower-level renderer that takes a caller-provided bitmap
@@ -511,43 +501,3 @@ def _parse_renderopts(
     
     # TODO consider using a namedtuple or something
     return cl_format, rev_byteorder, fill_color, flags
-
-
-class PdfPosConv:
-    """
-    Pdf coordinate translator.
-    
-    Hint:
-        Use :meth:`.PdfPage.get_posconv` to obtain an instance of this class.
-        It is not normally necessary to access the :class:`.PdfPosConv` constructor directly.
-    
-    Parameters:
-        page (PdfPage):
-            Handle to the page.
-        pos_args (tuple[int*5]):
-            pdfium canvas args (start_x, start_y, size_x, size_y, rotate), as in ``FPDF_RenderPageBitmap()`` etc.
-    """
-    
-    def __init__(self, page, pos_args):
-        self.page = page
-        self.pos_args = pos_args
-    
-    def to_page(self, bitmap_x, bitmap_y):
-        """
-        Translate coordinates from bitmap to page.
-        """
-        page_x, page_y = ctypes.c_double(), ctypes.c_double()
-        ok = pdfium_c.FPDF_DeviceToPage(self.page, *self.pos_args, bitmap_x, bitmap_y, page_x, page_y)
-        if not ok:
-            raise PdfiumError("Failed to translate to page coordinates.")
-        return (page_x.value, page_y.value)
-    
-    def to_bitmap(self, page_x, page_y):
-        """
-        Translate coordinates from page to bitmap.
-        """
-        bitmap_x, bitmap_y = ctypes.c_int(), ctypes.c_int()
-        ok = pdfium_c.FPDF_PageToDevice(self.page, *self.pos_args, page_x, page_y, bitmap_x, bitmap_y)
-        if not ok:
-            raise PdfiumError("Failed to translate to bitmap coordinates.")
-        return (bitmap_x.value, bitmap_y.value)
diff --git a/tests/test_rendering.py b/tests/test_rendering.py
index d757d0b9a..e1ee21f82 100644
--- a/tests/test_rendering.py
+++ b/tests/test_rendering.py
@@ -393,7 +393,7 @@ def test_draw_image_borders():
     pdf_qpl = [i.get_quad_points() for i in images]
     
     bitmap = page.render(scale=1)
-    posconv = page.get_posconv(bitmap)
+    posconv = bitmap.get_posconv(page)
     pil_image = bitmap.to_pil()
     bitmap_qpl  = [[posconv.to_bitmap(x, y) for x, y in qps] for qps in pdf_qpl]
     

From ad4ec2c56a54546b497639cd7c27237c6eb68347 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 9 Apr 2024 18:39:48 +0200
Subject: [PATCH 037/140] Add experimental position normalizer

---
 docs/devel/changelog_staging.md  |  1 +
 src/pypdfium2/_helpers/bitmap.py |  6 ++-
 src/pypdfium2/_helpers/page.py   | 65 +++++++++++++++++++++++++++++++-
 3 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 909d28b77..ce0d4c8da 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -20,6 +20,7 @@
 
 *Improvements and new features*
 - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates.
+- Also added `PdfPosNormalizer` and `PdfPage.get_pos_normalizer()` as a wrapper around `PdfPosConv`.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
 - Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added an assertion to make sure requirements are met, and updated docs accordingly.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index f98fe76dd..a9343a6d9 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -278,9 +278,9 @@ def from_pil(cls, pil_image):
     
     def get_posconv(self, page):
         """
-        Acquire a :class:`.PdfPosConv` coordinate translator for this bitmap and the page it was rendered from.
+        Acquire a :class:`.PdfPosConv` object to translate between coordinates on the bitmap and the page it was rendered from.
         
-        This API requires passing in the page explicitly, to avoid holding a strong reference, so that bitmap and page can be freed by finalizer independently.
+        This method requires passing in the page explicitly, to avoid holding a strong reference, so that bitmap and page can be independently freed by finalizer.
         """
         # if the bitmap was rendered from a page, resolve weakref and check identity
         if not self._pos_args or self._pos_args[0]() is not page:
@@ -328,6 +328,8 @@ class PdfPosConv:
             pdfium canvas args (start_x, start_y, size_x, size_y, rotate), as in ``FPDF_RenderPageBitmap()`` etc.
     """
     
+    # FIXME would we have to do overflow checking against too large sizes?
+    
     def __init__(self, page, pos_args):
         self.page = page
         self.pos_args = pos_args
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 14b72a76a..49e455a3c 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = ("PdfPage", )
+__all__ = ("PdfPage", "PdfPosNormalizer")
 
 import math
 import ctypes
@@ -10,7 +10,7 @@
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
 from pypdfium2._helpers.misc import PdfiumError
-from pypdfium2._helpers.bitmap import PdfBitmap
+from pypdfium2._helpers.bitmap import PdfBitmap, PdfPosConv
 from pypdfium2._helpers.textpage import PdfTextPage
 from pypdfium2._helpers.pageobjects import PdfObject
 
@@ -322,6 +322,31 @@ def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY):
         return rc
     
     
+    def get_pos_normalizer(self, ps=5, origin="bottom_left"):
+        """
+        Set up a coordinate normalizer object that may be used to apply PDF coordinate system transformations to values, or unapply them.
+        
+        This may be useful when writing PDF position data to a format that assumes a strict coordinate system, or to conveniently translate visual input values to raw values (e.g. swapping crop for a page with rotated/mirrored coordinate system).
+        
+        Note, as pdfium itself does not currently expose a generic coordinate normalizer, we are absusing the page <-> raster translator APIs by supplying a fictional raster of a certain scale, which is rather inelegant, as there is some back-and-forth calculation and an inherent loss of precision (though it can be made irrelevantly small), due to interjection of the raster.
+        
+        Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`PdfPosConv` directly, to avoid even more unnecessary calculation.
+        
+        Parameters:
+            ps (float):
+                Scale factor to use for the fictional raster. Controls the precision of normalized values.
+            origin (str):
+                The corner to use as origin (``bottom_left`` or ``top_left``).
+                The underlying pdfium API works with top left, but the default here is bottom left so that raw and normalized values align for a non-transformed coordinate system.
+        Returns:
+            PdfPosNormalizer
+        """
+        w, h = self.get_size()
+        w, h = round(w*ps), round(h*ps)
+        posconv = PdfPosConv(self, (0, 0, w, h, 0))
+        return PdfPosNormalizer(posconv, ps, origin)
+    
+    
     # TODO
     # - add helpers for matrix-based and interruptible rendering
     # - add lower-level renderer that takes a caller-provided bitmap
@@ -501,3 +526,39 @@ def _parse_renderopts(
     
     # TODO consider using a namedtuple or something
     return cl_format, rev_byteorder, fill_color, flags
+
+
+class PdfPosNormalizer:
+    """
+    Pdf coordinate normalizer.
+    See :meth:`.PdfPage.get_pos_normalizer` for description.
+    """
+
+    def __init__(self, posconv, ps, origin):
+        self._posconv = posconv
+        self._ps = ps
+        if origin == "top_left":
+            self._translate_y = lambda y: y
+        elif origin == "bottom_left":
+            size_y = posconv.pos_args[3]
+            self._translate_y = lambda y: size_y - y
+        else:
+            raise ValueError(f"Origin {origin!r} is not a supported corner.")
+    
+    def to_norm(self, raw_x, raw_y):
+        """
+        Translate raw to normalized coordinates. This applies coordinate system transformations.
+        """
+        x, y = self._posconv.to_bitmap(raw_x, raw_y)
+        x = x / self._ps
+        y = self._translate_y(y) / self._ps
+        return x, y
+    
+    def to_raw(self, norm_x, norm_y):
+        """
+        Translate normalized to raw coordinates.
+        This unapplies coordinate system transformations by doing the inverse transformation.
+        """
+        x = round(norm_x * self._ps)
+        y = round(self._translate_y(norm_y * self._ps))
+        return self._posconv.to_page(x, y)

From 00a738bb28be2e3d1e588802b391a60672960f7a Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 9 Apr 2024 23:49:33 +0200
Subject: [PATCH 038/140] docs: include changelog_staging also with non-main
 branches

The decisive aspect is not what branch we are on, but whether we are on
a tagged codebase or not.
---
 docs/source/changelog.rst                  |  2 +-
 docs/source/conf.py                        | 39 ++++++----------------
 docs/source/index.rst                      |  2 +-
 setupsrc/pypdfium2_setup/autorelease.py    | 19 ++---------
 setupsrc/pypdfium2_setup/packaging_base.py | 15 +++++++++
 5 files changed, 31 insertions(+), 46 deletions(-)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index b1ca25645..5c526855b 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -4,7 +4,7 @@
 Changelog
 =========
 
-.. ifconfig:: build_type == 'latest'
+.. ifconfig:: have_changes
     
     .. warning::
          This is a documentation build for an unreleased version of pypdfium2, so it is possible that new changes are not logged yet.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 18a452957..b87e1db9c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -5,7 +5,6 @@
 # See https://www.sphinx-doc.org/en/master/usage/configuration.html
 # and https://docs.readthedocs.io/en/stable/environment-variables.html
 
-import os
 import sys
 import time
 import collections
@@ -13,27 +12,15 @@
 
 sys.path.insert(0, str(Path(__file__).parents[2] / "setupsrc"))
 from pypdfium2_setup.packaging_base import (
-    run_cmd,
-    ProjectDir,
+    parse_git_tag,
+    get_next_changelog,
 )
 
-
-def _get_build_type():
-    
-    # RTD uses git checkout --force origin/... which results in a detached HEAD state, so we cannot easily get the branch name
-    # Thus query for an RTD-specific environment variable instead
-    rtd_vn = os.environ.get("READTHEDOCS_VERSION_NAME", None)
-    if rtd_vn:
-        return rtd_vn
-    
-    branch = run_cmd(["git", "branch", "--show-current"], cwd=ProjectDir, capture=True)
-    if branch == "main":
-        return "latest"
-    else:
-        return branch
-
-
-build_type = _get_build_type()
+# FIXME not sure if this will work on RTD
+tag_info = parse_git_tag()
+have_changes = tag_info["n_commits"] > 0 or tag_info["dirty"]
+if get_next_changelog():
+    assert have_changes
 
 project = "pypdfium2"
 author = "pypdfium2-team"
@@ -81,14 +68,10 @@ def _get_build_type():
 
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-rst_prolog
 # .. |br| raw:: html
-
 #    <br/>
-rst_prolog = """
-.. |build_type| replace:: %(build_type)s
-""" % dict(
-    build_type = build_type,
-)
-
+rst_prolog = f"""
+.. |have_changes| replace:: {have_changes}
+"""
 
 def remove_namedtuple_aliases(app, what, name, obj, skip, options):
     if type(obj) is collections._tuplegetter:
@@ -98,4 +81,4 @@ def remove_namedtuple_aliases(app, what, name, obj, skip, options):
 
 def setup(app):
     app.connect('autodoc-skip-member', remove_namedtuple_aliases)
-    app.add_config_value("build_type", "latest", "env")
+    app.add_config_value("have_changes", True, "env")
diff --git a/docs/source/index.rst b/docs/source/index.rst
index b67e35815..60d4e81ea 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -4,7 +4,7 @@
 pypdfium2
 =========
 
-Welcome to the documentation for the support model of pypdfium2 (|build_type| build).
+Welcome to the documentation for the support model of pypdfium2.
 
 .. toctree::
    :maxdepth: 2
diff --git a/setupsrc/pypdfium2_setup/autorelease.py b/setupsrc/pypdfium2_setup/autorelease.py
index 85d94639a..520999cdc 100644
--- a/setupsrc/pypdfium2_setup/autorelease.py
+++ b/setupsrc/pypdfium2_setup/autorelease.py
@@ -151,21 +151,6 @@ def make_releasenotes(summary, prev_pdfium, new_pdfium, prev_tag, new_tag, c_upd
     (ProjectDir/"RELEASE.md").write_text(relnotes)
 
 
-def get_changelog_staging(beta):
-    
-    content = ChangelogStaging.read_text()
-    pos = content.index("\n", content.index("# Changelog")) + 1
-    header = content[:pos].strip() + "\n"
-    devel_msg = content[pos:].strip()
-    if devel_msg:
-        devel_msg += "\n"
-    
-    if beta is None:  # flush
-        ChangelogStaging.write_text(header)
-    
-    return devel_msg
-
-
 def main():
     
     parser = argparse.ArgumentParser(
@@ -193,7 +178,9 @@ def main():
     write_json(AR_RecordFile, dict(pdfium=new_pdfium, tag=new_tag))
     
     update_refbindings(latest_pdfium)
-    summary = get_changelog_staging(new_helpers["beta"])
+    summary = get_next_changelog(
+        flush = new_helpers["beta"] is None
+    )
     log_changes(summary, record["pdfium"], new_pdfium, new_tag, new_helpers["beta"])
     if args.register:
         register_changes(new_tag)
diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py
index a8221a3ca..099a9ac59 100644
--- a/setupsrc/pypdfium2_setup/packaging_base.py
+++ b/setupsrc/pypdfium2_setup/packaging_base.py
@@ -633,3 +633,18 @@ def parse_modspec(modspec):
     else:
         modnames = ModulesAll
     return modnames
+
+
+def get_next_changelog(flush=False):
+    
+    content = ChangelogStaging.read_text()
+    pos = content.index("\n", content.index("# Changelog")) + 1
+    header = content[:pos].strip() + "\n"
+    devel_msg = content[pos:].strip()
+    if devel_msg:
+        devel_msg += "\n"
+    
+    if flush:
+        ChangelogStaging.write_text(header)
+    
+    return devel_msg

From 6ec5bb43073b615712720fbf25ab1d1f80980e48 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 10 Apr 2024 00:07:01 +0200
Subject: [PATCH 039/140] XXX print out tag info

---
 docs/source/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index b87e1db9c..629f8dcaf 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -18,6 +18,7 @@
 
 # FIXME not sure if this will work on RTD
 tag_info = parse_git_tag()
+print(tag_info, file=sys.stderr)
 have_changes = tag_info["n_commits"] > 0 or tag_info["dirty"]
 if get_next_changelog():
     assert have_changes

From 08f05d6ad7ce728533b56a22112605bfe246c51b Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 10 Apr 2024 00:10:16 +0200
Subject: [PATCH 040/140] XXX show git status

---
 docs/source/conf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 629f8dcaf..0653d17cf 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -14,11 +14,14 @@
 from pypdfium2_setup.packaging_base import (
     parse_git_tag,
     get_next_changelog,
+    run_cmd,
+    ProjectDir,
 )
 
 # FIXME not sure if this will work on RTD
 tag_info = parse_git_tag()
 print(tag_info, file=sys.stderr)
+print(run_cmd(["git", "status"], cwd=ProjectDir, capture=True), file=sys.stderr)
 have_changes = tag_info["n_commits"] > 0 or tag_info["dirty"]
 if get_next_changelog():
     assert have_changes

From 953d45442651105798b9f72f4128480db632e658 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 10 Apr 2024 00:18:23 +0200
Subject: [PATCH 041/140] continue on RTD

---
 docs/source/conf.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0653d17cf..d309fa770 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -5,6 +5,7 @@
 # See https://www.sphinx-doc.org/en/master/usage/configuration.html
 # and https://docs.readthedocs.io/en/stable/environment-variables.html
 
+import os
 import sys
 import time
 import collections
@@ -14,15 +15,13 @@
 from pypdfium2_setup.packaging_base import (
     parse_git_tag,
     get_next_changelog,
-    run_cmd,
-    ProjectDir,
 )
 
-# FIXME not sure if this will work on RTD
+
+# RTD modifies conf.py, so we have to ignore dirty state if on RTD
+is_rtd = os.environ.get("READTHEDOCS", "").lower() == "true"
 tag_info = parse_git_tag()
-print(tag_info, file=sys.stderr)
-print(run_cmd(["git", "status"], cwd=ProjectDir, capture=True), file=sys.stderr)
-have_changes = tag_info["n_commits"] > 0 or tag_info["dirty"]
+have_changes = tag_info["n_commits"] > 0 or (tag_info["dirty"] and not is_rtd)
 if get_next_changelog():
     assert have_changes
 

From d72f49823e1e6fbdbcf7d25cd9a4cb0fd5418b4e Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 10 Apr 2024 01:37:24 +0200
Subject: [PATCH 042/140] slightly improve docs

---
 docs/source/conf.py                   |  1 -
 src/pypdfium2/_helpers/document.py    |  5 ++---
 src/pypdfium2/_helpers/page.py        | 23 +++++++++++++----------
 src/pypdfium2/_helpers/pageobjects.py |  2 +-
 src/pypdfium2/_helpers/textpage.py    | 11 +++++++----
 5 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index d309fa770..1ca0449fa 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -17,7 +17,6 @@
     get_next_changelog,
 )
 
-
 # RTD modifies conf.py, so we have to ignore dirty state if on RTD
 is_rtd = os.environ.get("READTHEDOCS", "").lower() == "true"
 tag_info = parse_git_tag()
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 136abc658..776437ea1 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -144,9 +144,8 @@ def init_forms(self, config=None):
         Initialize a form env, if the document has forms. If already initialized, nothing will be done.
         See the :attr:`formenv` attribute.
     
-        Note:
-            If form rendering is desired, this method should be called directly after constructing the document,
-            before getting any page handles (due to PDFium's API).
+        Attention:
+            If form rendering is desired, this method must be called after constructing the document, before getting any page handles.
         
         Parameters:
             config (FPDF_FORMFILLINFO | None):
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 49e455a3c..56ac13471 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -23,9 +23,12 @@ class PdfPage (pdfium_i.AutoCloseable):
     Page helper class.
     
     Attributes:
-        raw (FPDF_PAGE): The underlying PDFium page handle.
-        pdf (PdfDocument): Reference to the document this page belongs to.
-        formenv (PdfFormEnv|None): Formenv handle, if the parent pdf had an active formenv at the time of page retrieval. None otherwise.
+        raw (FPDF_PAGE):
+            The underlying PDFium page handle.
+        pdf (PdfDocument):
+            Reference to the document this page belongs to.
+        formenv (PdfFormEnv | None):
+            Formenv handle, if the parent pdf had an active formenv at the time of page retrieval. None otherwise.
     """
     
     def __init__(self, raw, pdf, formenv):
@@ -101,9 +104,9 @@ def get_mediabox(self, fallback_ok=True):
             (float, float, float, float) | None:
             The page MediaBox in PDF canvas units, consisting of four coordinates (usually x0, y0, x1, y1).
             If MediaBox is not defined, returns ANSI A (0, 0, 612, 792) if ``fallback_ok=True``, None otherwise.
-        Note:
-            Due to quirks in PDFium's public API, all ``get_*box()`` functions except :meth:`.get_bbox`
-            do not inherit from parent nodes in the page tree (as of PDFium 5418).
+        
+        .. admonition:: Known issue\n
+            Due to quirks in PDFium, all ``get_*box()`` functions except :meth:`.get_bbox` do not inherit from parent nodes in the page tree (as of PDFium 5418).
         """
         # https://crbug.com/pdfium/1786
         return self._get_box(pdfium_c.FPDFPage_GetMediaBox, lambda: (0, 0, 612, 792), fallback_ok)
@@ -266,7 +269,7 @@ def get_objects(self, filter=None, max_depth=2, form=None, level=0):
             :class:`.PdfObject`: A page object.
         """
         
-        # TODO? close skipped objects explicitly ?
+        # TODO close skipped objects explicitly ?
         
         if form:
             count_objects = pdfium_c.FPDFFormObj_CountObjects
@@ -326,11 +329,11 @@ def get_pos_normalizer(self, ps=5, origin="bottom_left"):
         """
         Set up a coordinate normalizer object that may be used to apply PDF coordinate system transformations to values, or unapply them.
         
-        This may be useful when writing PDF position data to a format that assumes a strict coordinate system, or to conveniently translate visual input values to raw values (e.g. swapping crop for a page with rotated/mirrored coordinate system).
+        This may be useful to conveniently translate visual input values to raw values (e.g. swapping crop for a page with rotated/mirrored coordinate system), or when passing position data to a receiver that assumes a strict coordinate system.
         
         Note, as pdfium itself does not currently expose a generic coordinate normalizer, we are absusing the page <-> raster translator APIs by supplying a fictional raster of a certain scale, which is rather inelegant, as there is some back-and-forth calculation and an inherent loss of precision (though it can be made irrelevantly small), due to interjection of the raster.
         
-        Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`PdfPosConv` directly, to avoid even more unnecessary calculation.
+        Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`.PdfPosConv` directly, to avoid even more unnecessary calculation.
         
         Parameters:
             ps (float):
@@ -339,7 +342,7 @@ def get_pos_normalizer(self, ps=5, origin="bottom_left"):
                 The corner to use as origin (``bottom_left`` or ``top_left``).
                 The underlying pdfium API works with top left, but the default here is bottom left so that raw and normalized values align for a non-transformed coordinate system.
         Returns:
-            PdfPosNormalizer
+            PdfPosNormalizer: Position normalization helper.
         """
         w, h = self.get_size()
         w, h = round(w*ps), round(h*ps)
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index ac1e6e784..d1be171e0 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -343,7 +343,7 @@ def extract(self, dest, *args, **kwargs):
         
         This method can only extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly.
         Otherwise, the pixel data is decoded, and re-encoded using :mod:`PIL`.
-        For images with simple filters only, ``get_data(decode_simple=True)`` is used for decoding to preserve higher bit depth or special color formats not supported by ``FPDF_BITMAP``.
+        For images with simple filters only, ``get_data(decode_simple=True)`` is used to preserve higher bit depth or special color formats not supported by ``FPDF_BITMAP``.
         For images with complex filters, we have to resort to :meth:`.get_bitmap`, which can be a lossy operation.
         
         Note, this method ignores alpha masks, and potentially other data stored separately of the main data stream, which might lead to incorrect representation of the image.
diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
index 82ab1eb63..9d9170fbe 100644
--- a/src/pypdfium2/_helpers/textpage.py
+++ b/src/pypdfium2/_helpers/textpage.py
@@ -20,8 +20,10 @@ class PdfTextPage (pdfium_i.AutoCloseable):
     Text page helper class.
     
     Attributes:
-        raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
-        page (PdfPage): Reference to the page this textpage belongs to.
+        raw (FPDF_TEXTPAGE):
+            The underlying PDFium textpage handle.
+        page (PdfPage):
+            Reference to the page this textpage belongs to.
     """
     
     def __init__(self, raw, page):
@@ -211,8 +213,9 @@ def get_charbox(self, index, loose=False):
     def get_rect(self, index):
         """
         Get the bounding box of a text rectangle at the given index.
-        Note that :meth:`.count_rects` must be called once with default parameters
-        before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).
+
+        Attention:
+            :meth:`.count_rects` must be called once with default params before subsequent :meth:`.get_rect` calls for this function to work.
         
         Returns:
             Float values for left, bottom, right and top in PDF canvas units.

From ea9b3ad33782e4562a097f6b6c3c82f64d7303bd Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 10 Apr 2024 18:06:01 +0200
Subject: [PATCH 043/140] Improve PdfBitmap.new_native() logic

The buffer calculation is stride-agnostic, so it's fine to mix with a
custom stride.

Permit this and add assertions for the caller-provided cases.
---
 src/pypdfium2/_helpers/bitmap.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index a9343a6d9..5d93b9bdf 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -119,16 +119,25 @@ def from_raw(cls, raw, rev_byteorder=False, ex_buffer=None):
     def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, stride=None):
         """
         Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by Python/ctypes, or provided by the caller.
-        Buffers allocated by this function are packed (i.e. no unused bytes at line end).
-        If an external buffer is provided, stride may be set if there is a padding.
+        
+        If buffer and stride are None, a packed buffer is created.
+        If buffer is None but a custom stride is given, a stride-agnostic buffer is created.
+        If both custom buffer and stride are given, they are used as-is.
+        
+        Caller-provided buffers or strides are subject to a logical validation.
         """
         
-        orig_stride = stride
+        bpc = pdfium_i.BitmapTypeToNChannels[format]
         if stride is None:
-            stride = width * pdfium_i.BitmapTypeToNChannels[format]
+            stride = width * bpc
+        else:
+            assert stride >= width * bpc
+        
         if buffer is None:
-            assert orig_stride is None
             buffer = (ctypes.c_ubyte * (stride * height))()
+        else:
+            assert len(buffer) >= stride * height
+        
         raw = pdfium_c.FPDFBitmap_CreateEx(width, height, format, buffer, stride)
         
         # alternatively, we could call the constructor directly with the information from above

From 0d9e47843dcd54e7cdc8fae780a13210d1791652 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 10 Apr 2024 23:21:21 +0200
Subject: [PATCH 044/140] Warn about pos normalizer having to be re-created

---
 src/pypdfium2/_helpers/page.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 56ac13471..4d20283d5 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -335,6 +335,9 @@ def get_pos_normalizer(self, ps=5, origin="bottom_left"):
         
         Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`.PdfPosConv` directly, to avoid even more unnecessary calculation.
         
+        Attention:
+            Whenever modifications to page geometry were made, the object has to be re-created to update the underlying fictional raster.
+        
         Parameters:
             ps (float):
                 Scale factor to use for the fictional raster. Controls the precision of normalized values.

From 8532ce63d32a4eadb55aeaa2c584609c3b87fd52 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 10 Apr 2024 23:27:23 +0200
Subject: [PATCH 045/140] bases: style nits

---
 src/pypdfium2/internal/bases.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py
index e1a70dcdb..51ae38fcb 100644
--- a/src/pypdfium2/internal/bases.py
+++ b/src/pypdfium2/internal/bases.py
@@ -26,8 +26,7 @@ class AutoCastable:
     
     @property
     def _as_parameter_(self):
-        # TODO tighten to `not isinstance(...)` (needs declaraction of C type)
-        if not self.raw:
+        if self.raw is None:
             raise RuntimeError("Cannot use closed object as C function parameter.")
         return self.raw
 
@@ -43,7 +42,7 @@ def _close_template(close_func, raw, obj_repr, state, parent, *args, **kwargs):
         os.write(sys.stderr.fileno(), f"-> Cannot close object, library is destroyed. This may cause a memory leak!\n".encode())
         return
     
-    assert (parent is None) or not parent._tree_closed()
+    assert parent is None or not parent._tree_closed()
     close_func(raw, *args, **kwargs)
 
 
@@ -51,7 +50,7 @@ class AutoCloseable (AutoCastable):
     
     def __init__(self, close_func, *args, obj=None, needs_free=True, **kwargs):
         
-        # NOTE proactively prevent accidental double initialization
+        # proactively prevent accidental double initialization
         assert not hasattr(self, "_finalizer")
         
         self._close_func = close_func
@@ -72,7 +71,7 @@ def __repr__(self):
     
     
     def _attach_finalizer(self):
-        # NOTE this function captures the value of the `parent` property at finalizer installation time - if it changes, detach the old finalizer and create a new one
+        # NOTE this function captures the value of the `parent` property at finalizer installation time
         assert self._finalizer is None
         self._finalizer = weakref.finalize(self._obj, _close_template, self._close_func, self.raw, repr(self), self._autoclose_state, self.parent, *self._ex_args, **self._ex_kwargs)
     

From 30afc7969fad34ff7859ac3646881672b4d92532 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 10 Apr 2024 23:36:57 +0200
Subject: [PATCH 046/140] Remove PdfPosNormalizer experiment

The fact that the object has to be re-created in response to any change
in page geometry renders it basically unusable IMO.

The whole approach was bad practice anyway. The correct way to do this
would be to patch pdfium with new APIs that don't do the raster rounding
and origin translation.

FWIW, I made a private copy of the code.
---
 docs/devel/changelog_staging.md |  1 -
 src/pypdfium2/_helpers/page.py  | 68 +--------------------------------
 2 files changed, 2 insertions(+), 67 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index ce0d4c8da..909d28b77 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -20,7 +20,6 @@
 
 *Improvements and new features*
 - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates.
-- Also added `PdfPosNormalizer` and `PdfPage.get_pos_normalizer()` as a wrapper around `PdfPosConv`.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
 - Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added an assertion to make sure requirements are met, and updated docs accordingly.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 4d20283d5..81bcd5420 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = ("PdfPage", "PdfPosNormalizer")
+__all__ = ("PdfPage")
 
 import math
 import ctypes
@@ -10,7 +10,7 @@
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
 from pypdfium2._helpers.misc import PdfiumError
-from pypdfium2._helpers.bitmap import PdfBitmap, PdfPosConv
+from pypdfium2._helpers.bitmap import PdfBitmap
 from pypdfium2._helpers.textpage import PdfTextPage
 from pypdfium2._helpers.pageobjects import PdfObject
 
@@ -325,34 +325,6 @@ def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY):
         return rc
     
     
-    def get_pos_normalizer(self, ps=5, origin="bottom_left"):
-        """
-        Set up a coordinate normalizer object that may be used to apply PDF coordinate system transformations to values, or unapply them.
-        
-        This may be useful to conveniently translate visual input values to raw values (e.g. swapping crop for a page with rotated/mirrored coordinate system), or when passing position data to a receiver that assumes a strict coordinate system.
-        
-        Note, as pdfium itself does not currently expose a generic coordinate normalizer, we are absusing the page <-> raster translator APIs by supplying a fictional raster of a certain scale, which is rather inelegant, as there is some back-and-forth calculation and an inherent loss of precision (though it can be made irrelevantly small), due to interjection of the raster.
-        
-        Conversely, this means you should not use this method for translating to/from an actual bitmap. Instead, use :meth:`.PdfBitmap.get_posconv`/:class:`.PdfPosConv` directly, to avoid even more unnecessary calculation.
-        
-        Attention:
-            Whenever modifications to page geometry were made, the object has to be re-created to update the underlying fictional raster.
-        
-        Parameters:
-            ps (float):
-                Scale factor to use for the fictional raster. Controls the precision of normalized values.
-            origin (str):
-                The corner to use as origin (``bottom_left`` or ``top_left``).
-                The underlying pdfium API works with top left, but the default here is bottom left so that raw and normalized values align for a non-transformed coordinate system.
-        Returns:
-            PdfPosNormalizer: Position normalization helper.
-        """
-        w, h = self.get_size()
-        w, h = round(w*ps), round(h*ps)
-        posconv = PdfPosConv(self, (0, 0, w, h, 0))
-        return PdfPosNormalizer(posconv, ps, origin)
-    
-    
     # TODO
     # - add helpers for matrix-based and interruptible rendering
     # - add lower-level renderer that takes a caller-provided bitmap
@@ -532,39 +504,3 @@ def _parse_renderopts(
     
     # TODO consider using a namedtuple or something
     return cl_format, rev_byteorder, fill_color, flags
-
-
-class PdfPosNormalizer:
-    """
-    Pdf coordinate normalizer.
-    See :meth:`.PdfPage.get_pos_normalizer` for description.
-    """
-
-    def __init__(self, posconv, ps, origin):
-        self._posconv = posconv
-        self._ps = ps
-        if origin == "top_left":
-            self._translate_y = lambda y: y
-        elif origin == "bottom_left":
-            size_y = posconv.pos_args[3]
-            self._translate_y = lambda y: size_y - y
-        else:
-            raise ValueError(f"Origin {origin!r} is not a supported corner.")
-    
-    def to_norm(self, raw_x, raw_y):
-        """
-        Translate raw to normalized coordinates. This applies coordinate system transformations.
-        """
-        x, y = self._posconv.to_bitmap(raw_x, raw_y)
-        x = x / self._ps
-        y = self._translate_y(y) / self._ps
-        return x, y
-    
-    def to_raw(self, norm_x, norm_y):
-        """
-        Translate normalized to raw coordinates.
-        This unapplies coordinate system transformations by doing the inverse transformation.
-        """
-        x = round(norm_x * self._ps)
-        y = round(self._translate_y(norm_y * self._ps))
-        return self._posconv.to_page(x, y)

From 1fcbfd81667df834522ede507e8099c9b1932aaa Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Apr 2024 19:06:20 +0200
Subject: [PATCH 047/140] fix `__all__` blunder

("...") reduces to "...", which is also iterable (but quite wrong),
leading to `AttributeError: module 'pypdfium2._helpers.page' has no
attribute 'P'`

Perhaps it would be smarter to use lists rather than tuples anyway.
---
 src/pypdfium2/_helpers/page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 81bcd5420..4f3906fef 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-__all__ = ("PdfPage")
+__all__ = ("PdfPage", )
 
 import math
 import ctypes

From 4fe9d0d9c2e4e1a6809563e0f0415f8fd554aafb Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Apr 2024 19:12:10 +0200
Subject: [PATCH 048/140] get_count(): fix doc blunder

It is supposed to be the number of direct children only (which,
incidentally, also makes more sense).
Not sure how I got it into my mind this would be recursive.
---
 src/pypdfium2/_helpers/document.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 776437ea1..87ae513da 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -646,7 +646,7 @@ def get_title(self):
     def get_count(self):
         """
         Returns:
-            int: Signed number of child bookmarks, recursively counting all members in the subtree. Zero if the bookmark has no descendants.
+            int: Signed number of direct child bookmarks (i.e. non-recursive). Zero if the bookmark has no descendants.
             The initial state shall be closed (collapsed) if negative, open (expanded) if positive.
         """
         return pdfium_c.FPDFBookmark_GetCount(self)

From ccc0b1804e5505e3a807b30406ef4c5f8021dea1 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Apr 2024 19:15:46 +0200
Subject: [PATCH 049/140] move up helper function

---
 src/pypdfium2/_helpers/document.py | 52 +++++++++++++++---------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 87ae513da..27d94fbee 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -529,6 +529,32 @@ def get_toc(
             bm_ptr = pdfium_c.FPDFBookmark_GetNextSibling(self, bm_ptr)
 
 
+def _open_pdf(input_data, password, autoclose):
+    
+    to_hold, to_close = (), ()
+    if password is not None:
+        password = (password+"\x00").encode("utf-8")
+    
+    if isinstance(input_data, Path):
+        pdf = pdfium_c.FPDF_LoadDocument((str(input_data)+"\x00").encode("utf-8"), password)
+    elif isinstance(input_data, (bytes, ctypes.Array)):
+        pdf = pdfium_c.FPDF_LoadMemDocument64(input_data, len(input_data), password)
+        to_hold = (input_data, )
+    elif pdfium_i.is_buffer(input_data, "r"):
+        bufaccess, to_hold = pdfium_i.get_bufreader(input_data)
+        if autoclose:
+            to_close = (input_data, )
+        pdf = pdfium_c.FPDF_LoadCustomDocument(bufaccess, password)
+    else:
+        raise TypeError(f"Invalid input type '{type(input_data).__name__}'")
+    
+    if pdfium_c.FPDF_GetPageCount(pdf) < 1:
+        err_code = pdfium_c.FPDF_GetLastError()
+        raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).")
+    
+    return pdf, to_hold, to_close
+
+
 class PdfFormEnv (pdfium_i.AutoCloseable):
     """
     Form environment helper class.
@@ -589,32 +615,6 @@ def as_pageobject(self):
         return PdfObject(raw=raw_pageobj, pdf=self.pdf)
 
 
-def _open_pdf(input_data, password, autoclose):
-    
-    to_hold, to_close = (), ()
-    if password is not None:
-        password = (password+"\x00").encode("utf-8")
-    
-    if isinstance(input_data, Path):
-        pdf = pdfium_c.FPDF_LoadDocument((str(input_data)+"\x00").encode("utf-8"), password)
-    elif isinstance(input_data, (bytes, ctypes.Array)):
-        pdf = pdfium_c.FPDF_LoadMemDocument64(input_data, len(input_data), password)
-        to_hold = (input_data, )
-    elif pdfium_i.is_buffer(input_data, "r"):
-        bufaccess, to_hold = pdfium_i.get_bufreader(input_data)
-        if autoclose:
-            to_close = (input_data, )
-        pdf = pdfium_c.FPDF_LoadCustomDocument(bufaccess, password)
-    else:
-        raise TypeError(f"Invalid input type '{type(input_data).__name__}'")
-    
-    if pdfium_c.FPDF_GetPageCount(pdf) < 1:
-        err_code = pdfium_c.FPDF_GetLastError()
-        raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).")
-    
-    return pdf, to_hold, to_close
-
-
 class PdfBookmark (pdfium_i.AutoCastable):
     """
     Bookmark helper class.

From bf3e6164eb1b113561ca9b062c3f3b85372714ec Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Apr 2024 19:17:33 +0200
Subject: [PATCH 050/140] update changelog

---
 docs/devel/changelog_staging.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 909d28b77..c5bb8a9bd 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -28,6 +28,7 @@
 
 *Project*
 - Merged `tests_old/` back into `tests/`.
+- Docs: Improved logic when to include the unreleased version warning and upcoming changelog.
 
 <!-- TODO
 See https://github.com/pypdfium2-team/pypdfium2/blob/devel_old/docs/devel/changelog_staging.md

From 2d90a63a4ba85a80beaf296ff36f367c0039c2d7 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 16 Apr 2024 15:53:42 +0200
Subject: [PATCH 051/140] Add error code annotation to PdfiumError (CC #308)

---
 src/pypdfium2/_helpers/document.py |  5 +++--
 src/pypdfium2/_helpers/misc.py     | 15 +++++++++++++--
 tests/test_document.py             |  4 +++-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 27d94fbee..b9bece529 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -33,7 +33,7 @@ class PdfDocument (pdfium_i.AutoCloseable):
             Whether byte buffer input should be automatically closed on finalization.
     
     Raises:
-        PdfiumError: Raised if the document failed to load. The exception message is annotated with the reason reported by PDFium.
+        PdfiumError: Raised if the document failed to load. The exception is annotated with the reason reported by PDFium (via message and :attr:`~.PdfiumError.err_code`).
         FileNotFoundError: Raised if an invalid or non-existent file path was given.
     
     Hint:
@@ -178,6 +178,7 @@ def init_forms(self, config=None):
             if "XFA" in PDFIUM_INFO.flags:
                 ok = pdfium_c.FPDF_LoadXFA(self)
                 if not ok:
+                    # FIXME ability to propagate an optional exception with error code info?
                     err = pdfium_c.FPDF_GetLastError()
                     logger.warning(f"FPDF_LoadXFA() failed with {pdfium_i.XFAErrorToStr.get(err)}")
             else:
@@ -550,7 +551,7 @@ def _open_pdf(input_data, password, autoclose):
     
     if pdfium_c.FPDF_GetPageCount(pdf) < 1:
         err_code = pdfium_c.FPDF_GetLastError()
-        raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).")
+        raise PdfiumError(f"Failed to load document (PDFium: {pdfium_i.ErrorToStr.get(err_code)}).", err_code=err_code)
     
     return pdf, to_hold, to_close
 
diff --git a/src/pypdfium2/_helpers/misc.py b/src/pypdfium2/_helpers/misc.py
index 370522141..9b19e0d54 100644
--- a/src/pypdfium2/_helpers/misc.py
+++ b/src/pypdfium2/_helpers/misc.py
@@ -5,5 +5,16 @@
 
 
 class PdfiumError (RuntimeError):
-    """ An exception from the PDFium library, detected by function return code. """
-    pass
+    """
+    An exception from the PDFium library, detected by function return code.
+    
+    Attributes:
+        err_code (int | None): PDFium error code, for programmatic handling of error subtypes, if provided by the API in question (e.g. document loading). None otherwise.
+    
+    Tip:
+        Use ``str(exc)`` to get the message of a caught exception.
+    """
+    
+    def __init__(self, msg, err_code=None):
+        super().__init__(msg)
+        self.err_code = err_code
diff --git a/tests/test_document.py b/tests/test_document.py
index d7bc54bf0..c7738ad86 100644
--- a/tests/test_document.py
+++ b/tests/test_document.py
@@ -138,8 +138,10 @@ def test_open_invalid():
         pdf = pdfium.PdfDocument(123)
     with pytest.raises(FileNotFoundError):
         pdf = pdfium.PdfDocument("invalid/path")
-    with pytest.raises(pdfium.PdfiumError, match=re.escape("Failed to load document (PDFium: Incorrect password error).")):
+    with pytest.raises(pdfium.PdfiumError, match=re.escape("Failed to load document (PDFium: Incorrect password error).")) as e:
         pdf = pdfium.PdfDocument(TestFiles.encrypted, password="wrong_password")
+    e = e.value
+    assert e.err_code == pdfium_c.FPDF_ERR_PASSWORD
 
 
 def test_misc():

From 3f40c1761206cb71c8b59b2fd2fb5f8f57b2ae97 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 17 Apr 2024 22:16:46 +0200
Subject: [PATCH 052/140] Prepare `get_text_range()` for pdfium change

See https://bugs.chromium.org/p/pdfium/issues/detail?id=2133#c13
---
 src/pypdfium2/_cli/extract_text.py |  2 +-
 src/pypdfium2/_helpers/textpage.py | 27 ++++++++++++---------------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/pypdfium2/_cli/extract_text.py b/src/pypdfium2/_cli/extract_text.py
index f212b6a6f..738645660 100644
--- a/src/pypdfium2/_cli/extract_text.py
+++ b/src/pypdfium2/_cli/extract_text.py
@@ -30,7 +30,7 @@ def main(args):
         
         # TODO let caller pass in possible range/boundary parameters
         if args.strategy == EXTRACT_RANGE:
-            text = textpage.get_text_range(force_this=True)
+            text = textpage.get_text_range()
         elif args.strategy == EXTRACT_BOUNDED:
             text = textpage.get_text_bounded()
         else:
diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
index 9d9170fbe..35c358bf4 100644
--- a/src/pypdfium2/_helpers/textpage.py
+++ b/src/pypdfium2/_helpers/textpage.py
@@ -5,10 +5,10 @@
 
 import ctypes
 import logging
-import warnings
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
 from pypdfium2._helpers.misc import PdfiumError
+from pypdfium2.version import PDFIUM_INFO
 
 c_double = ctypes.c_double
 
@@ -52,14 +52,8 @@ def _get_active_text_range(self, c_start, c_end, l_passive=0, r_passive=0):
         return t_start, t_end, l_passive, r_passive
     
     
-    def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False):
+    def get_text_range(self, index=0, count=-1, errors="ignore"):
         """
-        Warning:
-            .. versionchanged:: 4.28
-               Unexpected upstream changes have caused allocation size concerns with this API.
-               Using it is now discouraged unless you specifically need to extract a character range. Prefer :meth:`.get_text_bounded` where possible.
-               Calling this method with default params now implicitly translates to :meth:`.get_text_bounded` (pass ``force_this=True`` to circumvent).
-        
         Extract text from a given range.
         
         Parameters:
@@ -77,12 +71,6 @@ def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False):
             * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
         """
         
-        # https://github.com/pypdfium2-team/pypdfium2/issues/298
-        # https://crbug.com/pdfium/2133
-        if (index, count) == (0, -1) and not force_this:
-            warnings.warn("get_text_range() call with default params will be implicitly redirected to get_text_bounded()")
-            return self.get_text_bounded(errors=errors)
-        
         if count == -1:
             count = self.count_chars() - index
         
@@ -96,7 +84,16 @@ def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False):
         t_start, t_end, l_passive, r_passive = active_range
         index += l_passive
         count -= l_passive + r_passive
-        in_count = (t_end+1 - t_start)*2 + 1
+        in_count = t_end+1 - t_start
+        
+        # pdfium fea01fa9e2 to d6a4b27d80 requires assuming 4 bytes per character
+        # this corresponds to approx. >6167,<6415 or pdfium-binaries >=6191,<=6406
+        # https://github.com/pypdfium2-team/pypdfium2/issues/298
+        # https://crbug.com/pdfium/2133
+        # -> NOTE(geisserml) may be removed once pdfium-binaries > d6a4b27d80 is released
+        if 6167 < PDFIUM_INFO.build < 6415:
+            in_count *= 2
+        in_count += 1  # null terminator
         
         buffer = ctypes.create_string_buffer(in_count * 2)
         buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))

From 2338365f695bef3affcf30d9e9ab041068dc4e2c Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 19 Apr 2024 17:36:12 +0200
Subject: [PATCH 053/140] docs

---
 docs/devel/changelog_staging.md | 4 +++-
 src/pypdfium2/_helpers/page.py  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index c5bb8a9bd..5df9d36ca 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -21,8 +21,10 @@
 *Improvements and new features*
 - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
-- Added `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added an assertion to make sure requirements are met, and updated docs accordingly.
+- Exposed `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added check and updated docs accordingly.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
+- If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programmatically handle the error subtype.
+- Restored `get_text_range()` to its pre-v4.28 behavior, as pdfium reverted `FPDFText_GetText()` to UCS-2.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
 - Simplified version impl (no API change expected).
 
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 4f3906fef..d34322be0 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -355,7 +355,7 @@ def render(
                 Amount in PDF canvas units to cut off from page borders (left, bottom, right, top). Crop is applied after rotation.
                 
             may_draw_forms (bool):
-                If True, render form fields (provided the document has forms and :meth:`~PdfDocument.init_forms` was called).
+                If True, render form fields (provided the document has forms and :meth:`~.PdfDocument.init_forms` was called).
             
             bitmap_maker (typing.Callable):
                 Callback function used to create the :class:`.PdfBitmap`.

From 2bd4757c325e18da7ab856a0fa14009c3360f4f6 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 21 Apr 2024 15:56:03 +0200
Subject: [PATCH 054/140] CLI/render: fix XFA document length recognition

Sample file:
https://www.canada.ca/content/dam/ircc/migration/ircc/english/pdf/kits/forms/imm5708e.pdf
---
 docs/devel/changelog_staging.md    | 3 +++
 src/pypdfium2/_cli/_parsers.py     | 4 +++-
 src/pypdfium2/_cli/render.py       | 5 +----
 src/pypdfium2/_helpers/document.py | 2 +-
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 5df9d36ca..2c2600df2 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -28,6 +28,9 @@
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
 - Simplified version impl (no API change expected).
 
+*Bug fixes*
+- XFA / rendering CLI: Fixed incorrect recognition of document length. `pdf.init_forms()` must be called before `len(pdf)`.
+
 *Project*
 - Merged `tests_old/` back into `tests/`.
 - Docs: Improved logic when to include the unreleased version warning and upcoming changelog.
diff --git a/src/pypdfium2/_cli/_parsers.py b/src/pypdfium2/_cli/_parsers.py
index 97997dbc6..abffe4e5d 100644
--- a/src/pypdfium2/_cli/_parsers.py
+++ b/src/pypdfium2/_cli/_parsers.py
@@ -82,8 +82,10 @@ def add_n_digits(parser):
     )
 
 
-def get_input(args, **kwargs):
+def get_input(args, init_forms=False, **kwargs):
     pdf = pdfium.PdfDocument(args.input, password=args.password, **kwargs)
+    if init_forms:
+        pdf.init_forms()
     if "pages" in args and not args.pages:
         args.pages = [i for i in range(len(pdf))]
     return pdf
diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 7c400b791..7d3977f64 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -244,7 +244,7 @@ def main(args):
     
     # TODO turn into a python-usable API yielding output paths as they are written
     
-    pdf = get_input(args)
+    pdf = get_input(args, init_forms=args.draw_forms)
     
     # TODO move to parsers?
     pdf_len = len(pdf)
@@ -301,9 +301,6 @@ def main(args):
     if len(args.pages) <= args.linear:
         
         logger.info("Linear rendering ...")
-        if args.draw_forms:
-            pdf.init_forms()
-        
         for i in args.pages:
             _render_job(i, pdf, kwargs, engine)
         
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index b9bece529..0ab49b85c 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -145,7 +145,7 @@ def init_forms(self, config=None):
         See the :attr:`formenv` attribute.
     
         Attention:
-            If form rendering is desired, this method must be called after constructing the document, before getting any page handles.
+            If form rendering is desired, this method shall be called immediately after document construction, before getting document length or page handles.
         
         Parameters:
             config (FPDF_FORMFILLINFO | None):

From 875f9117b09531f6337e8a1bac6e926eac1b0489 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 21 Apr 2024 23:01:25 +0200
Subject: [PATCH 055/140] add minor note on matrix multiplication (e, f)

---
 src/pypdfium2/_helpers/matrix.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pypdfium2/_helpers/matrix.py b/src/pypdfium2/_helpers/matrix.py
index de489857b..9ba1de292 100644
--- a/src/pypdfium2/_helpers/matrix.py
+++ b/src/pypdfium2/_helpers/matrix.py
@@ -88,6 +88,7 @@ def multiply(self, other):
             b = self.a*other.b + self.b*other.d,
             c = self.c*other.a + self.d*other.c,
             d = self.c*other.b + self.d*other.d,
+            # corresponds to: e, f = other.on_point(self.e, self.f) - transforms X/Y translation
             e = self.e*other.a + self.f*other.c + other.e,
             f = self.e*other.b + self.f*other.d + other.f,
         )

From cfb2f0d04ef615d838af731d8178a88fa1231657 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 23 Apr 2024 13:56:52 +0200
Subject: [PATCH 056/140] posconv: ensure page is non-null

---
 src/pypdfium2/_helpers/bitmap.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index 5d93b9bdf..43f297dde 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -337,9 +337,10 @@ class PdfPosConv:
             pdfium canvas args (start_x, start_y, size_x, size_y, rotate), as in ``FPDF_RenderPageBitmap()`` etc.
     """
     
-    # FIXME would we have to do overflow checking against too large sizes?
+    # FIXME do we have to do overflow checking against too large sizes?
     
     def __init__(self, page, pos_args):
+        assert bool(page)
         self.page = page
         self.pos_args = pos_args
     

From b5367c01e8c6fff852f86de7b8c7928d7bb474ef Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 1 May 2024 15:24:31 +0200
Subject: [PATCH 057/140] textpage/search: allow passthrough of caller flags

---
 src/pypdfium2/_helpers/textpage.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
index 35c358bf4..6b67f0c3e 100644
--- a/src/pypdfium2/_helpers/textpage.py
+++ b/src/pypdfium2/_helpers/textpage.py
@@ -224,7 +224,7 @@ def get_rect(self, index):
         return (l.value, b.value, r.value, t.value)
     
     
-    def search(self, text, index=0, match_case=False, match_whole_word=False, consecutive=False):
+    def search(self, text, index=0, match_case=False, match_whole_word=False, consecutive=False, flags=0):
         """
         Locate text on the page.
         
@@ -240,6 +240,8 @@ def search(self, text, index=0, match_case=False, match_whole_word=False, consec
             consecutive (bool):
                 If False (the default), :meth:`.search` will skip past the current match to look for the next match.
                 If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
+            flags (int):
+                Passthrough of raw pdfium searching flags. Note that you may want to use the boolean options instead.
         Returns:
             PdfTextSearcher: A helper object to search text.
         """
@@ -247,7 +249,6 @@ def search(self, text, index=0, match_case=False, match_whole_word=False, consec
         if len(text) == 0:
             raise ValueError("Text length must be greater than 0.")
         
-        flags = 0
         if match_case:
             flags |= pdfium_c.FPDF_MATCHCASE
         if match_whole_word:

From 1ce93c3842ad3bcb32bc4cdf549796be57287c0b Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 1 May 2024 15:41:55 +0200
Subject: [PATCH 058/140] cli/pageobjects: skip empty pages

Restore the previous behavior that empty pages would be silently skipped
rather than printing a page header without body.

Before, this was handled via a somewhat crude preamble concept.
---
 src/pypdfium2/_cli/pageobjects.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/pypdfium2/_cli/pageobjects.py b/src/pypdfium2/_cli/pageobjects.py
index 176f5c44d..933fe0ab8 100644
--- a/src/pypdfium2/_cli/pageobjects.py
+++ b/src/pypdfium2/_cli/pageobjects.py
@@ -3,6 +3,7 @@
 
 # TODO test-confirm filter and info params
 
+from itertools import chain
 from collections import OrderedDict
 import pypdfium2._helpers as pdfium
 import pypdfium2.internal as pdfium_i
@@ -83,10 +84,16 @@ def main(args):
         
         page = pdf[i]
         obj_searcher = page.get_objects(args.filter, max_depth=args.max_depth)
+        # note, more_itertools.peekable() could handle this more elegantly
+        try:
+            first_obj = next(obj_searcher)
+        except StopIteration:
+            continue
+        
         print(f"# Page {i+1}")
         count = 0
         
-        for obj in obj_searcher:
+        for obj in chain([first_obj], obj_searcher):
             
             pad_0 = "    " * obj.level
             pad_1 = pad_0 + "    "

From df23f6c003fe6f720921b975034919a3bb8ca124 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 1 May 2024 19:14:17 +0200
Subject: [PATCH 059/140] Remove planned changes

---
 docs/source/index.rst          |  3 +--
 docs/source/planned_changes.md | 10 ----------
 2 files changed, 1 insertion(+), 12 deletions(-)
 delete mode 100644 docs/source/planned_changes.md

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 60d4e81ea..f67141a8c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -16,9 +16,8 @@ Welcome to the documentation for the support model of pypdfium2.
 
 .. toctree::
    :maxdepth: 1
-   :caption: Progress
+   :caption: Release Notes
    
-   planned_changes
    changelog
 
 
diff --git a/docs/source/planned_changes.md b/docs/source/planned_changes.md
deleted file mode 100644
index 4a6ea384c..000000000
--- a/docs/source/planned_changes.md
+++ /dev/null
@@ -1,10 +0,0 @@
-<!-- SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com> -->
-<!-- SPDX-License-Identifier: CC-BY-4.0 -->
-
-<!-- TODO remove planned_changes.md and merge content into some other file -->
-
-# Planned Changes
-
-To find out about possible planned changes, you can ...
-* Search the codebase for `TODO(apibreak)`.
-* Check if there is a development branch. If so, take a look at its changelog (`docs/devel/changelog_staging.md`).

From d577349cf99f48cf4da11a9711444654e20dbd16 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 5 May 2024 19:22:27 +0200
Subject: [PATCH 060/140] minor readme improvements

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index caeeecfc2..6a953820e 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct
     PDFIUM_PLATFORM="sourcebuild" python -m pip install -v .
     ```
     Building PDFium may take a long time, as it comes with its bundled toolchain and deps, rather than taking them from the system.[^pdfium_buildsystem]
-    However, we can at least provide the `--use-syslibs` option to build against system-provided runtime libraries.
+    However, we can at least provide the `--use-syslibs` option to build against system runtime libraries.
   
   * <a id="user-content-install-source-system" class="anchor" href="#install-source-system">With system-provided binary 🔗</a>
     ```bash
@@ -98,14 +98,14 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct
   
   See [Setup Magic](#setup-magic) for details.
   
-  Support for source installs (esp. with self-built/system pdfium) is limited, as their integrity depends somewhat on a correctly acting caller.
+  Support for source installs (esp. with self-built/system pdfium) is limited, as their integrity somewhat depends on a correctly acting caller.
   
-  Installing an `sdist` does not implicitly trigger a sourcebuild if no pre-built binary is available. It is preferred to let callers decide consciously what to do, and run the build script without pip encapsulation.
+  Installing an `sdist` does not implicitly trigger a sourcebuild if no pre-built binary is available. We prefer to let callers decide consciously what to do, and run the build script without pip encapsulation.
   
   Relevant pip options:
   * `-v`: Verbose logging output. Useful for debugging.
   * `-e`: Install in editable mode, so the installation points to the source tree. This way, changes directly take effect without needing to re-install. Recommended for development.
-  * `--no-build-isolation`: Do not isolate setup in a virtual env; use the main env instead. This renders `pyproject.toml [build-system]` inactive, setup deps must be prepared by caller. Useful to install custom versions of setup deps, or as speedup when installing repeatedly.
+  * `--no-build-isolation`: Do not isolate setup in a virtual env; use the main env instead. This renders `pyproject.toml [build-system]` inactive, so setup deps must be prepared by caller. Useful to install custom versions of setup deps, or as speedup when installing repeatedly.
   
   [^pdfium_buildsystem]: This means pdfium may not compile on arbitrary hosts. The script is limited to build hosts supported by Google's toolchain. Ideally, we'd need an alternative build system that runs with system packages instead.
 
@@ -129,7 +129,8 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct
     conda install pypdfium2-team::pypdfium2_helpers --override-channels -c pypdfium2-team -c bblanchon
     ```
     
-    Adding the channels permanently and tightening priority is encouraged to include pypdfium2 in `conda update` by default, and to avoid accidentally replacing the install with a different channel. (If desired, you may limit the channel config to the current environment by adding `--env`.)
+    If desired, you may limit the channel config to the current environment by adding `--env`.
+    Adding the channels permanently and tightening priority is encouraged to include pypdfium2 in `conda update` by default, and to avoid accidentally replacing the install with a different channel.
     Otherwise, you should be cautious when making changes to the environment.
   
   + To depend on pypdfium2 in a `conda-build` recipe

From 640e80a9efe287f857e4ef9964aa2cc3d0218d47 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 5 May 2024 23:38:50 +0200
Subject: [PATCH 061/140] CLI/arrange: rm pointless var, better release
 implicit fh

---
 src/pypdfium2/_cli/arrange.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/pypdfium2/_cli/arrange.py b/src/pypdfium2/_cli/arrange.py
index ec7108447..5e2b050b3 100644
--- a/src/pypdfium2/_cli/arrange.py
+++ b/src/pypdfium2/_cli/arrange.py
@@ -41,11 +41,9 @@ def main(args):
         args.passwords.append(None)
     
     dest_pdf = pdfium.PdfDocument.new()
-    index = 0
     
     for in_path, pages, password in zip(args.inputs, args.pages, args.passwords):
-        src_pdf = pdfium.PdfDocument(in_path, password=password)
-        dest_pdf.import_pages(src_pdf, pages=pages)
-        index += len(src_pdf)
+        with pdfium.PdfDocument(in_path, password=password) as src_pdf:
+            dest_pdf.import_pages(src_pdf, pages=pages)
     
     dest_pdf.save(args.output)

From 8437cfd22ab2b9c807fba252ef7bbd0579770060 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 5 May 2024 23:46:10 +0200
Subject: [PATCH 062/140] CLI: clean up some comments

---
 src/pypdfium2/_cli/arrange.py        | 1 -
 src/pypdfium2/_cli/attachments.py    | 1 -
 src/pypdfium2/_cli/extract_images.py | 1 -
 src/pypdfium2/_cli/extract_text.py   | 1 -
 src/pypdfium2/_cli/pdfinfo.py        | 1 -
 src/pypdfium2/_cli/render.py         | 6 +-----
 src/pypdfium2/_cli/tile.py           | 1 -
 src/pypdfium2/_cli/toc.py            | 1 -
 8 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/pypdfium2/_cli/arrange.py b/src/pypdfium2/_cli/arrange.py
index 5e2b050b3..9261a3006 100644
--- a/src/pypdfium2/_cli/arrange.py
+++ b/src/pypdfium2/_cli/arrange.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 import pypdfium2._helpers as pdfium
-# TODO? consider dotted access
 from pypdfium2._cli._parsers import parse_numtext
 
 
diff --git a/src/pypdfium2/_cli/attachments.py b/src/pypdfium2/_cli/attachments.py
index 039c58cd4..1536aa615 100644
--- a/src/pypdfium2/_cli/attachments.py
+++ b/src/pypdfium2/_cli/attachments.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 from pathlib import Path
-# TODO? consider dotted access
 from pypdfium2._cli._parsers import (
     add_input, get_input,
     parse_numtext,
diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py
index 6091d3489..87e5aeaef 100644
--- a/src/pypdfium2/_cli/extract_images.py
+++ b/src/pypdfium2/_cli/extract_images.py
@@ -7,7 +7,6 @@
 from pathlib import Path
 import pypdfium2.raw as pdfium_c
 import pypdfium2._helpers as pdfium
-# TODO? consider dotted access
 from pypdfium2._cli._parsers import add_input, get_input
 
 
diff --git a/src/pypdfium2/_cli/extract_text.py b/src/pypdfium2/_cli/extract_text.py
index 738645660..360e69897 100644
--- a/src/pypdfium2/_cli/extract_text.py
+++ b/src/pypdfium2/_cli/extract_text.py
@@ -1,7 +1,6 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
-# TODO? consider dotted access
 from pypdfium2._cli._parsers import add_input, get_input
 
 EXTRACT_RANGE   = "range"
diff --git a/src/pypdfium2/_cli/pdfinfo.py b/src/pypdfium2/_cli/pdfinfo.py
index f4daffc03..f8dbd0011 100644
--- a/src/pypdfium2/_cli/pdfinfo.py
+++ b/src/pypdfium2/_cli/pdfinfo.py
@@ -3,7 +3,6 @@
 
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
-# TODO? consider dotted access
 from pypdfium2._cli._parsers import (
     add_input,
     add_n_digits,
diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 7d3977f64..2236d346a 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -17,7 +17,6 @@
 import pypdfium2._helpers as pdfium
 import pypdfium2.internal as pdfium_i
 import pypdfium2.raw as pdfium_r
-# TODO? consider dotted access
 from pypdfium2._cli._parsers import (
     add_input, get_input,
     setup_logging,
@@ -240,13 +239,11 @@ def _render_parallel_job(i):
     global ProcObjs; _render_job(i, *ProcObjs)
 
 
+# TODO turn into a python-usable API yielding output paths as they are written
 def main(args):
     
-    # TODO turn into a python-usable API yielding output paths as they are written
-    
     pdf = get_input(args, init_forms=args.draw_forms)
     
-    # TODO move to parsers?
     pdf_len = len(pdf)
     if not all(0 <= i < pdf_len for i in args.pages):
         raise ValueError("Out-of-bounds page indices are prohibited.")
@@ -309,7 +306,6 @@ def main(args):
         logger.info("Parallel rendering ...")
         
         ctx = mp.get_context(args.parallel_strategy)
-        # TODO unify using mp.pool.Pool(context=...) ?
         pool_backends = dict(
             mp = (ctx.Pool, "imap"),
             ft = (functools.partial(ft.ProcessPoolExecutor, mp_context=ctx), "map"),
diff --git a/src/pypdfium2/_cli/tile.py b/src/pypdfium2/_cli/tile.py
index 82d2c1e67..975648160 100644
--- a/src/pypdfium2/_cli/tile.py
+++ b/src/pypdfium2/_cli/tile.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 import pypdfium2.raw as pdfium_c
 import pypdfium2._helpers as pdfium
-# TODO? consider dotted access
 from pypdfium2._cli._parsers import add_input, get_input
 
 
diff --git a/src/pypdfium2/_cli/toc.py b/src/pypdfium2/_cli/toc.py
index 5425a33ea..79511cc85 100644
--- a/src/pypdfium2/_cli/toc.py
+++ b/src/pypdfium2/_cli/toc.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
 import pypdfium2.internal as pdfium_i
-# TODO? consider dotted access
 from pypdfium2._cli._parsers import (
     add_input,
     add_n_digits,

From 87a65479a3ae3d9cc4fe7c3e62152a32d28483b3 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 6 May 2024 16:06:01 +0200
Subject: [PATCH 063/140] Prepare for future release

---
 .github/workflows/trigger_conda_raw.yaml |  7 +++----
 .github/workflows/trigger_main.yaml      | 11 +++++------
 autorelease/config.json                  |  6 +++---
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/trigger_conda_raw.yaml b/.github/workflows/trigger_conda_raw.yaml
index e1484294b..b8e192ba9 100644
--- a/.github/workflows/trigger_conda_raw.yaml
+++ b/.github/workflows/trigger_conda_raw.yaml
@@ -3,10 +3,9 @@
 
 name: Trigger conda_raw release
 on:
-  # NOTE temporarily commented out, awaiting merge of the v5 branch
-  # schedule:
-  #   # pdfium-binaries triggers conda on the first Monday of month at 4 o'clock UTC, so we'll want to rebuild after that, but before the next main release where we want to use the package
-  #   - cron: '0 4 8 * *'  # monthly, 8th day
+  schedule:
+    # pdfium-binaries triggers conda on the first Monday of month at 4 o'clock UTC, so we'll want to rebuild after that, but before the next main release where we want to use the package
+    - cron: '0 4 8 * *'  # monthly, 8th day
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/trigger_main.yaml b/.github/workflows/trigger_main.yaml
index 01c1b755e..63cf7f499 100644
--- a/.github/workflows/trigger_main.yaml
+++ b/.github/workflows/trigger_main.yaml
@@ -5,12 +5,11 @@
 
 name: Trigger main release
 on:
-  # NOTE temporarily commented out, awaiting merge of the v5 branch
-  # # https://github.com/bblanchon/pdfium-binaries/blob/master/.github/workflows/trigger.yml
-  # # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule
-  # # https://crontab.guru/
-  # schedule:
-  #   - cron: '0 4 10 * *'  # monthly, 10th day
+  # https://github.com/bblanchon/pdfium-binaries/blob/master/.github/workflows/trigger.yml
+  # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule
+  # https://crontab.guru/
+  schedule:
+    - cron: '0 4 10 * *'  # monthly, 10th day
   workflow_dispatch:
 
 jobs:
diff --git a/autorelease/config.json b/autorelease/config.json
index 7f0e24f9f..7e2da1c10 100644
--- a/autorelease/config.json
+++ b/autorelease/config.json
@@ -1,4 +1,4 @@
 {
-  "beta": false,
-  "major": false
-}
\ No newline at end of file
+  "beta": true,
+  "major": true
+}

From 1886f89b0ce6c8bbd465abd92ea14cedbed244f0 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 6 May 2024 16:30:22 +0200
Subject: [PATCH 064/140] retain get_text_range() check for now

also move up get_text_bounded for docs
---
 docs/devel/changelog_staging.md    |  4 +-
 src/pypdfium2/_helpers/textpage.py | 71 +++++++++++++++---------------
 2 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 2c2600df2..88aff4c62 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -5,7 +5,7 @@
 
 # Changelog for next release
 
-*API-breaking changes*
+*API changes*
 - Rendering / Bitmap
   * Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). Instead, use `PdfPage.render()` with a loop or process pool.
   * Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
@@ -16,6 +16,7 @@
   * Renamed `PdfImage.get_size()` to `.get_px_size()`.
   * `PdfImage.extract()`: Removed `fb_render` param because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
 - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest.
+- `get_text_range()`: Restored pre-v4.28 behavior, as pdfium reverted `FPDFText_GetText()` to UCS-2. Removed implicit translation of default calls to `get_text_bounded()`. However, the latter should be preferred due to full Unicode support.
 - Removed legacy version flags.
 
 *Improvements and new features*
@@ -24,7 +25,6 @@
 - Exposed `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added check and updated docs accordingly.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
 - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype.
-- Restored `get_text_range()` to its pre-v4.28 behavior, as pdfium reverted `FPDFText_GetText()` to UCS-2.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
 - Simplified version impl (no API change expected).
 
diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
index 6b67f0c3e..d7eb53779 100644
--- a/src/pypdfium2/_helpers/textpage.py
+++ b/src/pypdfium2/_helpers/textpage.py
@@ -36,6 +36,38 @@ def parent(self):  # AutoCloseable hook
         return self.page
     
     
+    def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"):
+        """
+        Extract text from given boundaries in PDF coordinates.
+        If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
+        
+        Parameters:
+            errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
+        Returns:
+            str: The text on the page area in question, or an empty string if no text was found.
+        """
+        
+        bbox = self.page.get_bbox()
+        if left is None:
+            left = bbox[0]
+        if bottom is None:
+            bottom = bbox[1]
+        if right is None:
+            right = bbox[2]
+        if top is None:
+            top = bbox[3]
+        
+        args = (self, left, top, right, bottom)
+        n_chars = pdfium_c.FPDFText_GetBoundedText(*args, None, 0)
+        if n_chars <= 0:
+            return ""
+        
+        buffer = ctypes.create_string_buffer(n_chars * 2)
+        buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
+        pdfium_c.FPDFText_GetBoundedText(*args, buffer_ptr, n_chars)
+        return buffer.raw.decode("utf-16-le", errors=errors)
+    
+    
     def _get_active_text_range(self, c_start, c_end, l_passive=0, r_passive=0):
         
         if c_start > c_end:
@@ -56,6 +88,9 @@ def get_text_range(self, index=0, count=-1, errors="ignore"):
         """
         Extract text from a given range.
         
+        Warning:
+            This method is limited to UCS-2, whereas :meth:`.get_text_bounded` provides full Unicode support.
+        
         Parameters:
             index (int): Index of the first char to include.
             count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
@@ -86,11 +121,9 @@ def get_text_range(self, index=0, count=-1, errors="ignore"):
         count -= l_passive + r_passive
         in_count = t_end+1 - t_start
         
-        # pdfium fea01fa9e2 to d6a4b27d80 requires assuming 4 bytes per character
-        # this corresponds to approx. >6167,<6415 or pdfium-binaries >=6191,<=6406
+        # pdfium builds from fea01fa9e2 (>6167) to d6a4b27d80 (<6415) require assuming 4 bytes per character
         # https://github.com/pypdfium2-team/pypdfium2/issues/298
         # https://crbug.com/pdfium/2133
-        # -> NOTE(geisserml) may be removed once pdfium-binaries > d6a4b27d80 is released
         if 6167 < PDFIUM_INFO.build < 6415:
             in_count *= 2
         in_count += 1  # null terminator
@@ -103,38 +136,6 @@ def get_text_range(self, index=0, count=-1, errors="ignore"):
         return buffer.raw[:(out_count-1)*2].decode("utf-16-le", errors=errors)
     
     
-    def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"):
-        """
-        Extract text from given boundaries in PDF coordinates.
-        If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
-        
-        Parameters:
-            errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
-        Returns:
-            str: The text on the page area in question, or an empty string if no text was found.
-        """
-        
-        bbox = self.page.get_bbox()
-        if left is None:
-            left = bbox[0]
-        if bottom is None:
-            bottom = bbox[1]
-        if right is None:
-            right = bbox[2]
-        if top is None:
-            top = bbox[3]
-        
-        args = (self, left, top, right, bottom)
-        n_chars = pdfium_c.FPDFText_GetBoundedText(*args, None, 0)
-        if n_chars <= 0:
-            return ""
-        
-        buffer = ctypes.create_string_buffer(n_chars * 2)
-        buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
-        pdfium_c.FPDFText_GetBoundedText(*args, buffer_ptr, n_chars)
-        return buffer.raw.decode("utf-16-le", errors=errors)
-    
-    
     def count_chars(self):
         """
         Returns:

From 027b909f7792d61534bfb28b0e4365921af10376 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 6 May 2024 16:36:32 +0200
Subject: [PATCH 065/140] round off docs for `PdfBitmap.new_native()`

---
 src/pypdfium2/_helpers/bitmap.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index 43f297dde..df67c0599 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -120,11 +120,12 @@ def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, str
         """
         Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by Python/ctypes, or provided by the caller.
         
-        If buffer and stride are None, a packed buffer is created.
-        If buffer is None but a custom stride is given, a stride-agnostic buffer is created.
-        If both custom buffer and stride are given, they are used as-is.
+        * If buffer and stride are None, a packed buffer is created.
+        * If a custom buffer is given but no stride, the buffer is assumed to be packed.
+        * If a custom stride is given but no buffer, a stride-agnostic buffer is created.
+        * If both custom buffer and stride are given, they are used as-is.
         
-        Caller-provided buffers or strides are subject to a logical validation.
+        Caller-provided buffer/stride are subject to a logical validation.
         """
         
         bpc = pdfium_i.BitmapTypeToNChannels[format]

From 2f135e6cd5deb81f73e92806ecefa86cc3007eec Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 7 May 2024 00:20:53 +0200
Subject: [PATCH 066/140] PdfImage.extract(): fix for filenames containing
 non-extension dot

---
 src/pypdfium2/_helpers/pageobjects.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index d1be171e0..d7c6162f2 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -360,10 +360,10 @@ def extract(self, dest, *args, **kwargs):
         extraction_gen = _extract_smart(self, *args, **kwargs)
         format = next(extraction_gen)
         
-        if isinstance(dest, str):
-            dest = Path(dest)
         if isinstance(dest, Path):
-            with open(dest.with_suffix("."+format), "wb") as buf:
+            dest = str(dest)
+        if isinstance(dest, str):
+            with open(f"{dest}.{format}", "wb") as buf:
                 extraction_gen.send(buf)
         elif pdfium_i.is_buffer(dest, "w"):
             extraction_gen.send(dest)

From cdc0c06926f8361285be75029151f4d3fea95745 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 7 May 2024 00:22:05 +0200
Subject: [PATCH 067/140] CLI/extract-images: increase default recursion depth

Increase XObject recursion depth to 15 to be on the safe side of
capturing all images.
---
 src/pypdfium2/_cli/extract_images.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py
index 87e5aeaef..3a21b6df4 100644
--- a/src/pypdfium2/_cli/extract_images.py
+++ b/src/pypdfium2/_cli/extract_images.py
@@ -21,7 +21,7 @@ def attach(parser):
     parser.add_argument(
         "--max-depth",
         type = int,
-        default = 2,
+        default = 15,
         help = "Maximum recursion depth to consider when looking for page objects.",
     )
     parser.add_argument(

From 0f0dfb190f813b5837e2f2ba360d1858a8a47e4e Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 7 May 2024 00:28:59 +0200
Subject: [PATCH 068/140] update changelog

---
 docs/devel/changelog_staging.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 88aff4c62..e8ab03667 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -19,6 +19,10 @@
 - `get_text_range()`: Restored pre-v4.28 behavior, as pdfium reverted `FPDFText_GetText()` to UCS-2. Removed implicit translation of default calls to `get_text_bounded()`. However, the latter should be preferred due to full Unicode support.
 - Removed legacy version flags.
 
+*Bug fixes*
+- Fixed blunder in `PdfImage.extract()` producing an incorrect output path for prefixes containing a dot. In the `extract-images` CLI, this caused all output images of a type to be written to the same path for a document containing a non-extension dot in the filename.
+- XFA / rendering CLI: Fixed incorrect recognition of document length. `pdf.init_forms()` must be called before `len(pdf)`.
+
 *Improvements and new features*
 - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
@@ -28,9 +32,6 @@
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
 - Simplified version impl (no API change expected).
 
-*Bug fixes*
-- XFA / rendering CLI: Fixed incorrect recognition of document length. `pdf.init_forms()` must be called before `len(pdf)`.
-
 *Project*
 - Merged `tests_old/` back into `tests/`.
 - Docs: Improved logic when to include the unreleased version warning and upcoming changelog.

From 37bde6447121f853653bd94e6fea9e64325fe3f6 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 7 May 2024 00:20:53 +0200
Subject: [PATCH 069/140] PdfImage.extract(): fix for filenames containing
 non-extension dot

Cherry-picked from devel_new
---
 src/pypdfium2/_helpers/pageobjects.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 2be708f1a..7941e6e84 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -334,10 +334,10 @@ def extract(self, dest, *args, **kwargs):
         extraction_gen = _extract_smart(self, *args, **kwargs)
         format = next(extraction_gen)
         
-        if isinstance(dest, str):
-            dest = Path(dest)
         if isinstance(dest, Path):
-            with open(dest.with_suffix("."+format), "wb") as buf:
+            dest = str(dest)
+        if isinstance(dest, str):
+            with open(f"{dest}.{format}", "wb") as buf:
                 extraction_gen.send(buf)
         elif pdfium_i.is_buffer(dest, "w"):
             extraction_gen.send(dest)

From 863d85dd730612e34bbfeccd7212b0dc3ee30a03 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 7 May 2024 00:37:47 +0200
Subject: [PATCH 070/140] get_text_range(): adapt allocation to pdfium version

backported from devel_new branch
---
 src/pypdfium2/_helpers/textpage.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
index b1303fecd..9ba875f73 100644
--- a/src/pypdfium2/_helpers/textpage.py
+++ b/src/pypdfium2/_helpers/textpage.py
@@ -9,6 +9,7 @@
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
 from pypdfium2._helpers.misc import PdfiumError
+from pypdfium2.version import PDFIUM_INFO
 
 c_double = ctypes.c_double
 
@@ -94,7 +95,14 @@ def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False):
         t_start, t_end, l_passive, r_passive = active_range
         index += l_passive
         count -= l_passive + r_passive
-        in_count = (t_end+1 - t_start)*2 + 1
+        in_count = t_end+1 - t_start
+        
+        # pdfium fea01fa9e2 (>6167) to d6a4b27d80 (<6415) requires assuming 4 bytes per character
+        # https://github.com/pypdfium2-team/pypdfium2/issues/298
+        # https://crbug.com/pdfium/2133
+        if 6167 < PDFIUM_INFO.build < 6415:
+            in_count *= 2
+        in_count += 1  # null terminator
         
         buffer = ctypes.create_string_buffer(in_count * 2)
         buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))

From 555ba5e3299265526884993941ddbd5cb7f22957 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 7 May 2024 16:08:01 +0200
Subject: [PATCH 071/140] PdfImage.extract(): slightly simplify path handling

str and path can be embedded in an f-string equally, so do it in one
clause
---
 src/pypdfium2/_helpers/pageobjects.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index d7c6162f2..e89b7ca17 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -360,9 +360,7 @@ def extract(self, dest, *args, **kwargs):
         extraction_gen = _extract_smart(self, *args, **kwargs)
         format = next(extraction_gen)
         
-        if isinstance(dest, Path):
-            dest = str(dest)
-        if isinstance(dest, str):
+        if isinstance(dest, (str, Path)):
             with open(f"{dest}.{format}", "wb") as buf:
                 extraction_gen.send(buf)
         elif pdfium_i.is_buffer(dest, "w"):

From c9115bd0ef378a932b154a53c047196da99eed0e Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 7 May 2024 16:24:13 +0200
Subject: [PATCH 072/140] slightly simplify get_filters(skip_simple=True)

expunge simple filters afterwards to avoid re-checking skip_simple
---
 src/pypdfium2/_helpers/pageobjects.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index e89b7ca17..ae51a6204 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -330,10 +330,11 @@ def get_filters(self, skip_simple=False):
             buffer = ctypes.create_string_buffer(length)
             pdfium_c.FPDFImageObj_GetImageFilter(self, i, buffer, length)
             f = buffer.value.decode("utf-8")
-            if skip_simple and f in self.SIMPLE_FILTERS:
-                continue
             filters.append(f)
         
+        if skip_simple:
+            filters = [f for f in filters if f not in self.SIMPLE_FILTERS]
+        
         return filters
     
     

From 247873feefe1e5a4c000f93a3211a03581463a53 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 9 May 2024 19:13:58 +0200
Subject: [PATCH 073/140] Update changelog according to backports

---
 docs/devel/changelog_staging.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index e8ab03667..090507e1f 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -16,13 +16,9 @@
   * Renamed `PdfImage.get_size()` to `.get_px_size()`.
   * `PdfImage.extract()`: Removed `fb_render` param because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
 - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest.
-- `get_text_range()`: Restored pre-v4.28 behavior, as pdfium reverted `FPDFText_GetText()` to UCS-2. Removed implicit translation of default calls to `get_text_bounded()`. However, the latter should be preferred due to full Unicode support.
+- `get_text_range()`: Removed implicit translation of default calls to `get_text_bounded()`, as pdfium reverted `FPDFText_GetText()` to UCS-2, which resolves the allocation concern. However, callers are encouraged to explicitly use `get_text_bounded()` for full Unicode support.
 - Removed legacy version flags.
 
-*Bug fixes*
-- Fixed blunder in `PdfImage.extract()` producing an incorrect output path for prefixes containing a dot. In the `extract-images` CLI, this caused all output images of a type to be written to the same path for a document containing a non-extension dot in the filename.
-- XFA / rendering CLI: Fixed incorrect recognition of document length. `pdf.init_forms()` must be called before `len(pdf)`.
-
 *Improvements and new features*
 - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.

From 72b60ed9dbdf278e6653426d632cae35ecefb6c0 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 10 May 2024 21:00:04 +0200
Subject: [PATCH 074/140] consts: clean up comment

version classes are no longer deferred because it caused too much bloat
---
 src/pypdfium2/internal/consts.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/pypdfium2/internal/consts.py b/src/pypdfium2/internal/consts.py
index 28c050c36..0b95d123c 100644
--- a/src/pypdfium2/internal/consts.py
+++ b/src/pypdfium2/internal/consts.py
@@ -126,9 +126,8 @@ def get(self, key, default_prefix="Unhandled constant"):
 })
 
 
-# known implication: causes eager evaluation of pdfium version
 if "XFA" in PDFIUM_INFO.flags:
-    #: [V8/XFA builds only] Convert a PDFium XFA error constant (:attr:`FPDF_ERR_XFA*`) to string.
+    #: [XFA builds only] Convert a PDFium XFA error constant (:attr:`FPDF_ERR_XFA*`) to string.
     XFAErrorToStr = _fallback_dict({
         pdfium_c.FPDF_ERR_XFALOAD:   "Load error",
         pdfium_c.FPDF_ERR_XFALAYOUT: "Layout error",

From 1eab5cbbab9f3ca0a4399103ad6c91b2197ed0dc Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 13 May 2024 18:16:57 +0200
Subject: [PATCH 075/140] PdfPage.get_objects(): increase default recursion
 depth

to align with the extract-images CLI
---
 src/pypdfium2/_helpers/page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index d34322be0..55acb1e34 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -253,7 +253,7 @@ def gen_content(self):
             raise PdfiumError("Failed to generate page content.")
     
     
-    def get_objects(self, filter=None, max_depth=2, form=None, level=0):
+    def get_objects(self, filter=None, max_depth=15, form=None, level=0):
         """
         Iterate through the page objects on this page.
         

From ca9c964fb18fff38884d3a4e79e88e9ea2c36b3f Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 13 May 2024 18:15:42 +0200
Subject: [PATCH 076/140] slightly update docs for PdfImage.extract() again

---
 src/pypdfium2/_helpers/pageobjects.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index ae51a6204..45926481f 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -343,15 +343,18 @@ def extract(self, dest, *args, **kwargs):
         Extract the image into an independently usable file or byte buffer, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits.
         
         This method can only extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly.
-        Otherwise, the pixel data is decoded, and re-encoded using :mod:`PIL`.
+        Otherwise, the pixel data is decoded and re-encoded using :mod:`PIL`, which is slower and loses the original encoding.
         For images with simple filters only, ``get_data(decode_simple=True)`` is used to preserve higher bit depth or special color formats not supported by ``FPDF_BITMAP``.
-        For images with complex filters, we have to resort to :meth:`.get_bitmap`, which can be a lossy operation.
+        For images with complex filters other than those extracted directly, we have to resort to :meth:`.get_bitmap`.
         
-        Note, this method ignores alpha masks, and potentially other data stored separately of the main data stream, which might lead to incorrect representation of the image.
+        Note, this method is not able to account for alpha masks, and potentially other data stored separately of the main image stream, which might lead to incorrect representation of the image.
+        
+        Tip:
+            The ``pikepdf`` library is capable of preserving the original encoding in many cases where this method is not.
         
         Parameters:
             dest (str | pathlib.Path | io.BytesIO):
-                File prefix or byte buffer to which the image shall be written.
+                File path prefix or byte buffer to which the image shall be written.
             fb_format (str):
                 The image format to use in case it is necessary to (re-)encode the data.
         """

From f75e075c7a39fa29d3edbc00dd46dc7c1e27b374 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 13 May 2024 19:31:00 +0200
Subject: [PATCH 077/140] Add warning about textpage handles when removing text
 objects

See https://pdfium-review.googlesource.com/c/pdfium/+/118914
---
 src/pypdfium2/_helpers/page.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 55acb1e34..aa6065d14 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -228,6 +228,9 @@ def remove_obj(self, pageobj):
         As of PDFium 5692, detached page objects may be only re-inserted into existing pages of the same document.
         If the page object is not re-inserted into a page, its ``close()`` method may be called.
         
+        Caution:
+            If the object's :attr:`~.PdfObject.type` is :data:`FPDF_PAGEOBJ_TEXT`, all :class:`.PdfTextPage` handles ought to be closed before removing the object.
+        
         Parameters:
             pageobj (PdfObject): The page object to remove.
         """

From bc8e18c59efe517e9e946c6c3559025291b0af7e Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 13 May 2024 19:32:17 +0200
Subject: [PATCH 078/140] Explain PdfObject.close()

---
 src/pypdfium2/_helpers/pageobjects.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 45926481f..2316ae0e6 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -25,6 +25,10 @@ class PdfObject (pdfium_i.AutoCloseable):
     
     When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, depending on the object's :attr:`.type` (e. g. :class:`.PdfImage`).
     
+    Note:
+        :meth:`.PdfObject.close` only takes effect on loose pageobjects.
+        It is a no-op otherwise, because pageobjects that are part of a page are owned by pdfium, not the caller.
+    
     Attributes:
         raw (FPDF_PAGEOBJECT):
             The underlying PDFium pageobject handle.

From 9a0221462378af4416ddba21c0cac85feeb5dba4 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 13 May 2024 20:03:45 +0200
Subject: [PATCH 079/140] Autoclose textpage handles when removing text
 pageobject

---
 src/pypdfium2/_helpers/page.py | 14 ++++++++++++--
 tests/test_textpage.py         | 23 +++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index aa6065d14..0e9b24568 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -35,6 +35,7 @@ def __init__(self, raw, pdf, formenv):
         self.raw = raw
         self.pdf = pdf
         self.formenv = formenv
+        self._textpage_wrefs = []
         super().__init__(PdfPage._close_impl, self.formenv)
     
     
@@ -195,6 +196,7 @@ def get_textpage(self):
             raise PdfiumError("Failed to load text page.")
         textpage = PdfTextPage(raw_textpage, self)
         self._add_kid(textpage)
+        self._textpage_wrefs.append( weakref.ref(textpage) )
         return textpage
     
     
@@ -228,8 +230,8 @@ def remove_obj(self, pageobj):
         As of PDFium 5692, detached page objects may be only re-inserted into existing pages of the same document.
         If the page object is not re-inserted into a page, its ``close()`` method may be called.
         
-        Caution:
-            If the object's :attr:`~.PdfObject.type` is :data:`FPDF_PAGEOBJ_TEXT`, all :class:`.PdfTextPage` handles ought to be closed before removing the object.
+        Note:
+            If the object's :attr:`~.PdfObject.type` is :data:`FPDF_PAGEOBJ_TEXT`, any :class:`.PdfTextPage` handles to the page should be closed before removing the object.
         
         Parameters:
             pageobj (PdfObject): The page object to remove.
@@ -238,6 +240,14 @@ def remove_obj(self, pageobj):
         if pageobj.page is not self:
             raise ValueError("The page object you attempted to remove is not part of this page.")
         
+        # https://pdfium-review.googlesource.com/c/pdfium/+/118914
+        if pageobj.type == pdfium_c.FPDF_PAGEOBJ_TEXT:
+            for wref in self._textpage_wrefs:
+                textpage = wref()
+                if textpage and textpage.raw:
+                    logger.warning(f"When removing a text pageobject, any textpage handles ought to be closed beforehand - auto-closing {textpage}.")
+                    textpage.close()
+        
         ok = pdfium_c.FPDFPage_RemoveObject(self, pageobj)
         if not ok:
             raise PdfiumError("Failed to remove pageobject.")
diff --git a/tests/test_textpage.py b/tests/test_textpage.py
index eeacc9e33..be21a5386 100644
--- a/tests/test_textpage.py
+++ b/tests/test_textpage.py
@@ -3,7 +3,9 @@
 
 import re
 import pytest
+import logging
 import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
 from .conftest import TestFiles
 
 
@@ -152,3 +154,24 @@ def test_get_text_bounded_defaults_with_rotation():
     
     text = textpage.get_text_bounded()
     assert len(text) == 438
+
+
+@pytest.mark.parametrize("explicit_close", [False, True])
+def test_autoclose_with_remove_obj(caplog, explicit_close):
+    
+    pdf = pdfium.PdfDocument(TestFiles.text)
+    page = pdf[0]
+    textobj = next( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_TEXT]) )
+    assert len(page._textpage_wrefs) == 0
+    textpage = page.get_textpage()
+    assert len(page._textpage_wrefs) == 1
+    
+    if explicit_close:
+        textpage.close()
+    with caplog.at_level(logging.WARNING):
+        page.remove_obj(textobj)
+    
+    if explicit_close:
+        assert not caplog.text
+    else:
+        assert f"When removing a text pageobject, any textpage handles ought to be closed beforehand - auto-closing {textpage}." in caplog.text

From e38085f72c4233d8cb25df2a6c5d3b625e5c544e Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 13 May 2024 20:15:52 +0200
Subject: [PATCH 080/140] Add some tasks regarding AutoCloseable.close()

---
 src/pypdfium2/internal/bases.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py
index 51ae38fcb..8351ecc17 100644
--- a/src/pypdfium2/internal/bases.py
+++ b/src/pypdfium2/internal/bases.py
@@ -92,6 +92,9 @@ def _add_kid(self, k):
     
     def close(self, _by_parent=False):
         
+        # TODO invalidate self.raw if closing object without finalizer (supposedly, when closing a page, child pageobject handles fall invalid)
+        # TODO remove object from parent's kids cache on finalization to avoid unnecessary accumulation (also for PdfPage._textpage_wrefs)
+        
         if not self.raw or not self._finalizer:
             return False
         

From d23268948e3e240fdd2c2bb7355a5861f99b9bde Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 14 May 2024 00:06:26 +0200
Subject: [PATCH 081/140] Consistently call `PdfObject` `pageobject` in docs

"page object" is slightly unclear - it might be either an object on a
page (PdfObject), or an instance of PdfPage.
Therefore, call PdfObject "pageobject" (without space) to somewhat
outline the difference.
---
 docs/source/python_api.rst            |  4 ++--
 docs/source/shell_api.rst             |  4 ++--
 src/pypdfium2/__main__.py             |  2 +-
 src/pypdfium2/_cli/extract_images.py  |  2 +-
 src/pypdfium2/_helpers/document.py    |  4 ++--
 src/pypdfium2/_helpers/page.py        | 28 +++++++++++++--------------
 src/pypdfium2/_helpers/pageobjects.py |  8 ++++----
 7 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/docs/source/python_api.rst b/docs/source/python_api.rst
index 9e2c12399..7ca4182b2 100644
--- a/docs/source/python_api.rst
+++ b/docs/source/python_api.rst
@@ -84,8 +84,8 @@ Page
 ****
 .. automodule:: pypdfium2._helpers.page
 
-Page Objects
-************
+Pageobjects
+***********
 .. automodule:: pypdfium2._helpers.pageobjects
 
 Text Page
diff --git a/docs/source/shell_api.rst b/docs/source/shell_api.rst
index adaba15a8..d9cc9fc0a 100644
--- a/docs/source/shell_api.rst
+++ b/docs/source/shell_api.rst
@@ -46,8 +46,8 @@ Image Converter
 .. command-output:: pypdfium2 imgtopdf --help
 
 
-Page Objects Info
-*****************
+Pageobjects Info
+****************
 .. command-output:: pypdfium2 pageobjects --help
 
 
diff --git a/src/pypdfium2/__main__.py b/src/pypdfium2/__main__.py
index 55e5f1826..0bd7e580b 100644
--- a/src/pypdfium2/__main__.py
+++ b/src/pypdfium2/__main__.py
@@ -16,7 +16,7 @@
     "extract-images": "extract images",
     "extract-text":   "extract text",
     "imgtopdf":       "convert images to PDF",
-    "pageobjects":    "print info on page objects",
+    "pageobjects":    "print info on pageobjects",
     "pdfinfo":        "print info on document and pages",
     "render":         "rasterize pages",
     "tile":           "tile pages (N-up)",
diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py
index 3a21b6df4..df4e18e65 100644
--- a/src/pypdfium2/_cli/extract_images.py
+++ b/src/pypdfium2/_cli/extract_images.py
@@ -22,7 +22,7 @@ def attach(parser):
         "--max-depth",
         type = int,
         default = 15,
-        help = "Maximum recursion depth to consider when looking for page objects.",
+        help = "Maximum recursion depth to consider when looking for pageobjects.",
     )
     parser.add_argument(
         "--use-bitmap",
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 0ab49b85c..4a379def6 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -607,8 +607,8 @@ def parent(self):  # AutoCloseable hook
     def as_pageobject(self):
         """
         Returns:
-            PdfObject: An independent page object representation of the XObject.
-            If multiple page objects are created from one XObject, they share resources.
+            PdfObject: An independent pageobject representation of the XObject.
+            If multiple pageobjects are created from one XObject, they share resources.
             Pageobjects created from an XObject remain valid after the XObject is closed.
         """
         raw_pageobj = pdfium_c.FPDF_NewFormObjectFromXObject(self)
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 0e9b24568..518e4e497 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -202,15 +202,15 @@ def get_textpage(self):
     
     def insert_obj(self, pageobj):
         """
-        Insert a page object into the page.
+        Insert a pageobject into the page.
         
-        The page object must not belong to a page yet. If it belongs to a PDF, this page must be part of the PDF.
+        The pageobject must not belong to a page yet. If it belongs to a PDF, this page must be part of the PDF.
         
         Position and form are defined by the object's matrix.
         If it is the identity matrix, the object will appear as-is on the bottom left corner of the page.
         
         Parameters:
-            pageobj (PdfObject): The page object to insert.
+            pageobj (PdfObject): The pageobject to insert.
         """
         
         if pageobj.page:
@@ -226,19 +226,19 @@ def insert_obj(self, pageobj):
     
     def remove_obj(self, pageobj):
         """
-        Remove a page object from the page.
-        As of PDFium 5692, detached page objects may be only re-inserted into existing pages of the same document.
-        If the page object is not re-inserted into a page, its ``close()`` method may be called.
+        Remove a pageobject from the page.
+        As of PDFium 5692, detached pageobjects may be only re-inserted into existing pages of the same document.
+        If the pageobject is not re-inserted into a page, its ``close()`` method may be called.
         
         Note:
             If the object's :attr:`~.PdfObject.type` is :data:`FPDF_PAGEOBJ_TEXT`, any :class:`.PdfTextPage` handles to the page should be closed before removing the object.
         
         Parameters:
-            pageobj (PdfObject): The page object to remove.
+            pageobj (PdfObject): The pageobject to remove.
         """
                 
         if pageobj.page is not self:
-            raise ValueError("The page object you attempted to remove is not part of this page.")
+            raise ValueError("The pageobject you attempted to remove is not part of this page.")
         
         # https://pdfium-review.googlesource.com/c/pdfium/+/118914
         if pageobj.type == pdfium_c.FPDF_PAGEOBJ_TEXT:
@@ -257,7 +257,7 @@ def remove_obj(self, pageobj):
     
     def gen_content(self):
         """
-        Generate page content to apply additions, removals or modifications of page objects.
+        Generate page content to apply additions, removals or modifications of pageobjects.
         
         If page content was changed, this function should be called once before saving the document or re-loading the page.
         """
@@ -268,18 +268,18 @@ def gen_content(self):
     
     def get_objects(self, filter=None, max_depth=15, form=None, level=0):
         """
-        Iterate through the page objects on this page.
+        Iterate through the pageobjects on this page.
         
         Parameters:
             filter (list[int] | None):
-                An optional list of page object types to filter (:attr:`FPDF_PAGEOBJ_*`).
+                An optional list of pageobject types to filter (:attr:`FPDF_PAGEOBJ_*`).
                 Any objects whose type is not contained will be skipped.
                 If None or empty, all objects will be provided, regardless of their type.
             max_depth (int):
                 Maximum recursion depth to consider when descending into Form XObjects.
         
         Yields:
-            :class:`.PdfObject`: A page object.
+            :class:`.PdfObject`: A pageobject.
         """
         
         # TODO close skipped objects explicitly ?
@@ -295,13 +295,13 @@ def get_objects(self, filter=None, max_depth=15, form=None, level=0):
         
         n_objects = count_objects(parent)
         if n_objects < 0:
-            raise PdfiumError("Failed to get number of page objects.")
+            raise PdfiumError("Failed to get number of pageobjects.")
         
         for i in range(n_objects):
             
             raw_obj = get_object(parent, i)
             if not raw_obj:
-                raise PdfiumError("Failed to get page object.")
+                raise PdfiumError("Failed to get pageobject.")
             
             helper_obj = PdfObject(raw_obj, page=self, pdf=self.pdf, level=level)
             self._add_kid(helper_obj)
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 2316ae0e6..6f3f66e1d 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -21,7 +21,7 @@
 
 class PdfObject (pdfium_i.AutoCloseable):
     """
-    Page object helper class.
+    Pageobject helper class.
     
     When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, depending on the object's :attr:`.type` (e. g. :class:`.PdfImage`).
     
@@ -146,14 +146,14 @@ def set_matrix(self, matrix):
     def transform(self, matrix):
         """
         Parameters:
-            matrix (PdfMatrix): Multiply the page object's current transform matrix by this matrix.
+            matrix (PdfMatrix): Multiply the pageobject's current transform matrix by this matrix.
         """
         pdfium_c.FPDFPageObj_Transform(self, *matrix.get())
 
 
 class PdfImage (PdfObject):
     """
-    Image object helper class (specific kind of page object).
+    Image object helper class (specific kind of pageobject).
     """
     
     # cf. https://crbug.com/pdfium/1203
@@ -287,7 +287,7 @@ def get_bitmap(self, render=False):
         
         if render:
             if self.pdf is None:
-                raise RuntimeError("Cannot get rendered bitmap of loose page object.")
+                raise RuntimeError("Cannot get rendered bitmap of loose pageobject.")
             raw_bitmap = pdfium_c.FPDFImageObj_GetRenderedBitmap(self.pdf, self.page, self)
         else:
             raw_bitmap = pdfium_c.FPDFImageObj_GetBitmap(self)

From 428f4c396a1e3335b3c3cc54a02d4b638ccd8fb7 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 14 May 2024 01:37:23 +0200
Subject: [PATCH 082/140] PdfiumError: don't state the obvious

---
 src/pypdfium2/_helpers/misc.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/pypdfium2/_helpers/misc.py b/src/pypdfium2/_helpers/misc.py
index 9b19e0d54..977d1148c 100644
--- a/src/pypdfium2/_helpers/misc.py
+++ b/src/pypdfium2/_helpers/misc.py
@@ -10,9 +10,6 @@ class PdfiumError (RuntimeError):
     
     Attributes:
         err_code (int | None): PDFium error code, for programmatic handling of error subtypes, if provided by the API in question (e.g. document loading). None otherwise.
-    
-    Tip:
-        Use ``str(exc)`` to get the message of a caught exception.
     """
     
     def __init__(self, msg, err_code=None):

From 7cf09d8493e5e1e11f6c657650aedc97aaf46457 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 14 May 2024 01:58:01 +0200
Subject: [PATCH 083/140] docs/conf.py: comment out namedtuple handler

We no longer have any public namedtuples after the removal of
PdfOutlineItem and PdfBitmapInfo.
---
 docs/source/conf.py                   | 14 ++++++--------
 src/pypdfium2/_helpers/pageobjects.py |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1ca0449fa..e82d797a4 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -8,7 +8,7 @@
 import os
 import sys
 import time
-import collections
+# import collections
 from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).parents[2] / "setupsrc"))
@@ -59,7 +59,6 @@
     "members": True,
     "undoc-members": True,
     "show-inheritance": True,
-    # "inherited-members": True,
     "member-order": "bysource",
 }
 intersphinx_mapping = {
@@ -75,12 +74,11 @@
 .. |have_changes| replace:: {have_changes}
 """
 
-def remove_namedtuple_aliases(app, what, name, obj, skip, options):
-    if type(obj) is collections._tuplegetter:
-        return True
-    return skip
-
+# def remove_namedtuple_aliases(app, what, name, obj, skip, options):
+#     if type(obj) is collections._tuplegetter:
+#         return True
+#     return skip
 
 def setup(app):
-    app.connect('autodoc-skip-member', remove_namedtuple_aliases)
+    # app.connect('autodoc-skip-member', remove_namedtuple_aliases)
     app.add_config_value("have_changes", True, "env")
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 6f3f66e1d..5dddd3664 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -35,7 +35,7 @@ class PdfObject (pdfium_i.AutoCloseable):
         type (int):
             The object's type (:data:`FPDF_PAGEOBJ_*`).
         page (PdfPage):
-            Reference to the page this pageobject belongs to. May be None if it does not belong to a page yet.
+            Reference to the page this pageobject belongs to. May be None if not part of a page (e.g. new or detached object).
         pdf (PdfDocument):
             Reference to the document this pageobject belongs to. May be None if the object does not belong to a document yet.
             This attribute is always set if :attr:`.page` is set.

From 5c66a32085d86f5340f3a0dad8cf8cba76dae85f Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 14 May 2024 02:16:43 +0200
Subject: [PATCH 084/140] PdfBitmap: slightly improve docs for
 `new_foreign{_simple}()`

---
 src/pypdfium2/_helpers/bitmap.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index df67c0599..c2f75f519 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -149,8 +149,9 @@ def new_native(cls, width, height, format, rev_byteorder=False, buffer=None, str
     def new_foreign(cls, width, height, format, rev_byteorder=False, force_packed=False):
         """
         Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by PDFium.
+        There may be a padding of unused bytes at line end, unless *force_packed=True* is given.
         
-        Using this method is discouraged. Prefer :meth:`.new_native` instead.
+        Note that is encouraged to prefer :meth:`.new_native`.
         """
         stride = width * pdfium_i.BitmapTypeToNChannels[format] if force_packed else 0
         raw = pdfium_c.FPDFBitmap_CreateEx(width, height, format, None, stride)
@@ -160,10 +161,9 @@ def new_foreign(cls, width, height, format, rev_byteorder=False, force_packed=Fa
     @classmethod
     def new_foreign_simple(cls, width, height, use_alpha, rev_byteorder=False):
         """
-        Create a new bitmap using :func:`FPDFBitmap_Create`. The buffer is allocated by PDFium.
-        The resulting bitmap is supposed to be packed (i. e. no gap of unused bytes between lines).
+        Create a new bitmap using :func:`FPDFBitmap_Create`. The buffer is allocated by PDFium, and supposed to be packed (i. e. no gap of unused bytes between lines).
         
-        Using this method is discouraged. Prefer :meth:`.new_native` instead.
+        Note that it is encouraged to prefer :meth:`.new_native`.
         """
         raw = pdfium_c.FPDFBitmap_Create(width, height, use_alpha)
         return cls.from_raw(raw, rev_byteorder)

From c8e4a06b5b17f254c635098bc4d1197f51b4efbd Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 16 May 2024 00:44:05 +0200
Subject: [PATCH 085/140] Handle GetCharIndexAtPos() conforming with pdfium
 docs

---
 src/pypdfium2/_helpers/textpage.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
index d7eb53779..19cdb4ac6 100644
--- a/src/pypdfium2/_helpers/textpage.py
+++ b/src/pypdfium2/_helpers/textpage.py
@@ -172,11 +172,14 @@ def get_index(self, x, y, x_tol, y_tol):
             y_tol (float): Vertical tolerance.
         Returns:
             int | None: The index of the character at or nearby the point (x, y).
-            May be None if there is no character or an error occurred.
+            May be None if there is no character. If an internal error occurred, an exception will be raised.
         """
         index = pdfium_c.FPDFText_GetCharIndexAtPos(self, x, y, x_tol, y_tol)
-        if index < 0:
+        if index == -1:
             return None
+        elif index == -3:
+            raise PdfiumError("An error occurred on attempt to get char index by pos.")
+        assert index >= 0, "Negative return is not permitted (unhandled error code?)"
         return index
     
     

From 14a7fbbf567cf0cc4926591788880a2ffd9a6975 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 18 May 2024 23:32:34 +0200
Subject: [PATCH 086/140] PdfPage.get_objects(): don't register objects as kids

This was especially problematic as weakrefs are not cleaned up when the
object in question is closed/collected, so we potentially store many
dead pointers.
Imagine a caller invoking get_objects() repeatedly for iterating and a
page handle living for a long time afterwards - that somewhat resembles
a memory leak.
---
 src/pypdfium2/_helpers/page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 518e4e497..b539850fd 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -303,8 +303,8 @@ def get_objects(self, filter=None, max_depth=15, form=None, level=0):
             if not raw_obj:
                 raise PdfiumError("Failed to get pageobject.")
             
+            # Not a child object, because the lifetime of pageobjects that are part of a page is managed by pdfium. The .page reference is enough to keep the parent alive, unless the caller explicitly closes it (which may not merit storing countless of weakrefs).
             helper_obj = PdfObject(raw_obj, page=self, pdf=self.pdf, level=level)
-            self._add_kid(helper_obj)
             if not filter or helper_obj.type in filter:
                 yield helper_obj
             

From 992e9fe49a913b261393c8dd9460c40aaecf5542 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 18 May 2024 23:35:21 +0200
Subject: [PATCH 087/140] abstractly reformulate bases task

It might still be worth doing for the sake of conceptual correctness,
even if not currently relevant for practice.
---
 src/pypdfium2/internal/bases.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py
index 8351ecc17..976763b91 100644
--- a/src/pypdfium2/internal/bases.py
+++ b/src/pypdfium2/internal/bases.py
@@ -92,7 +92,7 @@ def _add_kid(self, k):
     
     def close(self, _by_parent=False):
         
-        # TODO invalidate self.raw if closing object without finalizer (supposedly, when closing a page, child pageobject handles fall invalid)
+        # TODO invalidate self.raw if closing object without finalizer to prevent access after a lifetime-managing parent is closed
         # TODO remove object from parent's kids cache on finalization to avoid unnecessary accumulation (also for PdfPage._textpage_wrefs)
         
         if not self.raw or not self._finalizer:

From 59d0e99010d3adf6163debc3cc0bd7eec97f834e Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 28 May 2024 00:32:00 +0200
Subject: [PATCH 088/140] CLI/extract-images: Fix another dotted filepath
 blunder

Same as 7ce4d31a302c2fdc50185e35fc67513d6b3ea373.
---
 src/pypdfium2/_cli/extract_images.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py
index df4e18e65..efa7790cb 100644
--- a/src/pypdfium2/_cli/extract_images.py
+++ b/src/pypdfium2/_cli/extract_images.py
@@ -68,7 +68,7 @@ def main(args):
             try:
                 if args.use_bitmap:
                     pil_image = image.get_bitmap(render=args.render).to_pil()
-                    pil_image.save( prefix.with_suffix("."+args.format) )
+                    pil_image.save(f"{prefix}.{args.format}")
                 else:
                     image.extract(prefix, fb_format=args.format)
             except pdfium.PdfiumError:

From af81b4754ea32292fb2db43a04b41c190d9dcf25 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 31 May 2024 16:30:31 +0200
Subject: [PATCH 089/140] Remove separate `_textpage_wrefs`

Given that PdfPage.get_objects() no longer falsely registers pageobjects
as kids, textpages are now the only members added to a page's kids
cache, nicely simplifying this code passage.

However, an even more future-proof approach would be to turn kids into a
mapping {"type": [*objects], ...} rather than a flat list of mixed types,
so we could access all kids of a given type without overhead.
---
 src/pypdfium2/_helpers/page.py  | 13 ++++++-------
 src/pypdfium2/internal/bases.py |  3 ++-
 tests/test_textpage.py          |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index b539850fd..f992c723c 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -35,7 +35,6 @@ def __init__(self, raw, pdf, formenv):
         self.raw = raw
         self.pdf = pdf
         self.formenv = formenv
-        self._textpage_wrefs = []
         super().__init__(PdfPage._close_impl, self.formenv)
     
     
@@ -196,7 +195,6 @@ def get_textpage(self):
             raise PdfiumError("Failed to load text page.")
         textpage = PdfTextPage(raw_textpage, self)
         self._add_kid(textpage)
-        self._textpage_wrefs.append( weakref.ref(textpage) )
         return textpage
     
     
@@ -242,11 +240,12 @@ def remove_obj(self, pageobj):
         
         # https://pdfium-review.googlesource.com/c/pdfium/+/118914
         if pageobj.type == pdfium_c.FPDF_PAGEOBJ_TEXT:
-            for wref in self._textpage_wrefs:
-                textpage = wref()
-                if textpage and textpage.raw:
-                    logger.warning(f"When removing a text pageobject, any textpage handles ought to be closed beforehand - auto-closing {textpage}.")
-                    textpage.close()
+            for wref in self._kids:
+                obj = wref()
+                if obj and obj.raw:
+                    assert isinstance(obj, PdfTextPage), "This code assumes all kids of a page are textpages."
+                    logger.warning(f"Removing text pageobbject implicitly closes affected textpage {obj}.")
+                    obj.close()
         
         ok = pdfium_c.FPDFPage_RemoveObject(self, pageobj)
         if not ok:
diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py
index 976763b91..ba347f528 100644
--- a/src/pypdfium2/internal/bases.py
+++ b/src/pypdfium2/internal/bases.py
@@ -92,8 +92,9 @@ def _add_kid(self, k):
     
     def close(self, _by_parent=False):
         
+        # TODO remove object from parent's kids cache on finalization to avoid unnecessary accumulation
+        # -> pre-requisite would be to handle kids inside finalizer, but IIRC there was some weird issue with that?
         # TODO invalidate self.raw if closing object without finalizer to prevent access after a lifetime-managing parent is closed
-        # TODO remove object from parent's kids cache on finalization to avoid unnecessary accumulation (also for PdfPage._textpage_wrefs)
         
         if not self.raw or not self._finalizer:
             return False
diff --git a/tests/test_textpage.py b/tests/test_textpage.py
index be21a5386..60789d8a3 100644
--- a/tests/test_textpage.py
+++ b/tests/test_textpage.py
@@ -162,9 +162,9 @@ def test_autoclose_with_remove_obj(caplog, explicit_close):
     pdf = pdfium.PdfDocument(TestFiles.text)
     page = pdf[0]
     textobj = next( page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_TEXT]) )
-    assert len(page._textpage_wrefs) == 0
+    assert len(page._kids) == 0
     textpage = page.get_textpage()
-    assert len(page._textpage_wrefs) == 1
+    assert len(page._kids) == 1
     
     if explicit_close:
         textpage.close()
@@ -174,4 +174,4 @@ def test_autoclose_with_remove_obj(caplog, explicit_close):
     if explicit_close:
         assert not caplog.text
     else:
-        assert f"When removing a text pageobject, any textpage handles ought to be closed beforehand - auto-closing {textpage}." in caplog.text
+        assert f"Removing text pageobbject implicitly closes affected textpage {textpage}." in caplog.text

From 45de679f7005bd7bac08b62ae575598d2deb7c57 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 4 Jun 2024 18:24:50 +0200
Subject: [PATCH 090/140] Clarify `Cannot close object; library is destroyed`
 condition

CC https://github.com/mindee/doctr/pull/1624
---
 src/pypdfium2/internal/bases.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/internal/bases.py b/src/pypdfium2/internal/bases.py
index ba347f528..823cfabe3 100644
--- a/src/pypdfium2/internal/bases.py
+++ b/src/pypdfium2/internal/bases.py
@@ -39,7 +39,7 @@ def _close_template(close_func, raw, obj_repr, state, parent, *args, **kwargs):
         os.write(sys.stderr.fileno(), f"Close ({desc}) {obj_repr}\n".encode())
     
     if not LIBRARY_AVAILABLE:
-        os.write(sys.stderr.fileno(), f"-> Cannot close object, library is destroyed. This may cause a memory leak!\n".encode())
+        os.write(sys.stderr.fileno(), f"-> Cannot close object; library is destroyed. This may happen on process exit, but should not during runtime.\n".encode())
         return
     
     assert parent is None or not parent._tree_closed()

From 3596eb09b67a609e7889d1729d7503a119ca55a8 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 4 Jul 2024 12:32:56 +0200
Subject: [PATCH 091/140] Correct PdfBookmark.get_count() docstring

---
 src/pypdfium2/_helpers/document.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 4a379def6..abff23207 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -647,8 +647,9 @@ def get_title(self):
     def get_count(self):
         """
         Returns:
-            int: Signed number of direct child bookmarks (i.e. non-recursive). Zero if the bookmark has no descendants.
-            The initial state shall be closed (collapsed) if negative, open (expanded) if positive.
+            int: Signed number of child bookmarks that would be visible if the bookmark were open (i.e. recursively counting children of open children).
+            The bookmark's initial state is open (expanded) if the number is positive, closed (collapsed) if negative.
+            Zero if the bookmark has no descendants.
         """
         return pdfium_c.FPDFBookmark_GetCount(self)
     

From 85eadfbe6010db46971fe21db5f2a30da534a9b1 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Jul 2024 12:33:05 +0200
Subject: [PATCH 092/140] rendering: lightness inversion for PIL

---
 src/pypdfium2/_cli/render.py | 88 +++++++++++++++++++++++++++++++-----
 1 file changed, 76 insertions(+), 12 deletions(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 2236d346a..6b39f7cb9 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -4,11 +4,18 @@
 import os
 import math
 import logging
+import colorsys
 import functools
 from pathlib import Path
 import multiprocessing as mp
 import concurrent.futures as ft
 
+try:
+    import PIL.Image
+    import PIL.ImageFilter
+    import PIL.ImageDraw
+except ImportError:
+    PIL = None
 try:
     import cv2
 except ImportError:
@@ -188,6 +195,21 @@ def attach(parser):
         type = str.lower,
         help = "The map function to use (backend specific, the default is an iterative map)."
     )
+    
+    postproc = parser.add_argument_group(
+        title = "Post processing",
+        description = "Options to post-process rendered images. Note, this may have a strongly negative impact on performance.",
+    )
+    postproc.add_argument(
+        "--invert-lightness",
+        action = "store_true",
+        help = "Invert lightness using the HLS color space (e.g. white<->black, dark_blue<->light_blue). The intent is to achieve a dark theme for documents with light background, while providing better visual results than classical color inversion or a flat pdfium color scheme.",
+    )
+    postproc.add_argument(
+        "--exclude-images",
+        action = "store_true",
+        help = "Whether to exclude PDF images from lightness inversion.",
+    )
 
 
 class SavingEngine:
@@ -199,22 +221,58 @@ def _get_path(self, i):
         output_dir, prefix, n_digits, format = self._path_parts
         return output_dir / f"{prefix}{i+1:0{n_digits}d}.{format}"
     
-    def __call__(self, bitmap, i):
+    def __call__(self, i, bitmap, page, postproc_kwargs):
         out_path = self._get_path(i)
-        self._saving_hook(out_path, bitmap)
+        self._saving_hook(out_path, bitmap, page, postproc_kwargs)
         logger.info(f"Wrote page {i+1} as {out_path.name}")
 
 
 class PILEngine (SavingEngine):
-    def _saving_hook(self, out_path, bitmap):
-        bitmap.to_pil().save(out_path)
+    
+    def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
+        pil_image = bitmap.to_pil()
+        posconv = bitmap.get_posconv(page)
+        pil_image = self.postprocess(pil_image, page, posconv, **postproc_kwargs)
+        pil_image.save(out_path)
+    
+    LINV_LUT_SIZE = 17
+    
+    @staticmethod
+    def _invert_px_lightness(r, g, b):
+        h, l, s = colorsys.rgb_to_hls(r, g, b)
+        l = 1 - l
+        return colorsys.hls_to_rgb(h, l, s)
+    
+    @classmethod
+    @functools.lru_cache(maxsize=1)
+    def _get_linv_lut(cls):
+        return PIL.ImageFilter.Color3DLUT.generate(cls.LINV_LUT_SIZE, cls._invert_px_lightness)
+    
+    @classmethod
+    def postprocess(cls, image, page, posconv, invert_lightness, exclude_images):
+        out_image = image
+        if invert_lightness:
+            out_image = image.filter(cls._get_linv_lut())
+            if exclude_images:
+                # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates
+                images = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
+                if len(images) > 0:
+                    mask = PIL.Image.new("1", image.size)
+                    draw = PIL.ImageDraw.Draw(mask)
+                    for obj in images:
+                        qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()]
+                        draw.polygon(qpoints, fill=1, outline=1)
+                    out_image.paste(image, mask=mask)
+        return out_image
+
 
 class NumpyCV2Engine (SavingEngine):
-    def _saving_hook(self, out_path, bitmap):
+    def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
+        # TODO post-processing
         cv2.imwrite(str(out_path), bitmap.to_numpy())
 
 
-def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine):
+def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine, postproc_kwargs):
     
     if extra_init:
         extra_init()
@@ -226,17 +284,18 @@ def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, e
         pdf.init_forms()
     
     global ProcObjs
-    ProcObjs = (pdf, kwargs, engine)
+    ProcObjs = (pdf, kwargs, engine, postproc_kwargs)
 
 
-def _render_job(i, pdf, kwargs, engine):
+def _render_job(i, pdf, kwargs, engine, postproc_kwargs):
     # logger.info(f"Started page {i+1} ...")
     page = pdf[i]
     bitmap = page.render(**kwargs)
-    engine(bitmap, i)
+    engine(i, bitmap, page, postproc_kwargs)
 
 def _render_parallel_job(i):
-    global ProcObjs; _render_job(i, *ProcObjs)
+    global ProcObjs
+    _render_job(i, *ProcObjs)
 
 
 # TODO turn into a python-usable API yielding output paths as they are written
@@ -288,6 +347,11 @@ def main(args):
     for type in args.no_antialias:
         kwargs[f"no_smooth{type}"] = True
     
+    postproc_kwargs = dict(
+        invert_lightness = args.invert_lightness,
+        exclude_images = args.exclude_images,
+    )
+    
     # TODO dump all args except password?
     logger.info(f"{args.engine_cls.__name__}, Format: {args.format}, rev_byteorder: {args.rev_byteorder}, prefer_bgrx {args.prefer_bgrx}")
     
@@ -299,7 +363,7 @@ def main(args):
         
         logger.info("Linear rendering ...")
         for i in args.pages:
-            _render_job(i, pdf, kwargs, engine)
+            _render_job(i, pdf, kwargs, engine, postproc_kwargs)
         
     else:
         
@@ -317,7 +381,7 @@ def main(args):
         extra_init = (setup_logging if args.parallel_strategy in ("spawn", "forkserver") else None)
         pool_kwargs = dict(
             initializer = _render_parallel_init,
-            initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine),
+            initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine, postproc_kwargs),
         )
         
         n_procs = min(args.processes, len(args.pages))

From c907e1ebb74a2f1a406bcfba91fee663612889b5 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Jul 2024 14:23:32 +0200
Subject: [PATCH 093/140] Add OpenCV lightness inversion

TODO: image exclusion
---
 src/pypdfium2/_cli/render.py | 46 ++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 6b39f7cb9..47b5d5643 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -230,46 +230,66 @@ def __call__(self, i, bitmap, page, postproc_kwargs):
 class PILEngine (SavingEngine):
     
     def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
-        pil_image = bitmap.to_pil()
         posconv = bitmap.get_posconv(page)
+        pil_image = bitmap.to_pil()
         pil_image = self.postprocess(pil_image, page, posconv, **postproc_kwargs)
         pil_image.save(out_path)
     
-    LINV_LUT_SIZE = 17
-    
     @staticmethod
     def _invert_px_lightness(r, g, b):
         h, l, s = colorsys.rgb_to_hls(r, g, b)
         l = 1 - l
         return colorsys.hls_to_rgb(h, l, s)
     
+    LINV_LUT_SIZE = 17
+    
     @classmethod
     @functools.lru_cache(maxsize=1)
     def _get_linv_lut(cls):
         return PIL.ImageFilter.Color3DLUT.generate(cls.LINV_LUT_SIZE, cls._invert_px_lightness)
     
     @classmethod
-    def postprocess(cls, image, page, posconv, invert_lightness, exclude_images):
-        out_image = image
+    def postprocess(cls, orig_image, page, posconv, invert_lightness, exclude_images):
+        out_image = orig_image
         if invert_lightness:
-            out_image = image.filter(cls._get_linv_lut())
+            out_image = out_image.filter(cls._get_linv_lut())
             if exclude_images:
                 # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates
-                images = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
-                if len(images) > 0:
-                    mask = PIL.Image.new("1", image.size)
+                image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
+                if len(image_objs) > 0:
+                    mask = PIL.Image.new("1", orig_image.size)
                     draw = PIL.ImageDraw.Draw(mask)
-                    for obj in images:
+                    for obj in image_objs:
                         qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()]
                         draw.polygon(qpoints, fill=1, outline=1)
-                    out_image.paste(image, mask=mask)
+                    out_image.paste(orig_image, mask=mask)
         return out_image
 
 
 class NumpyCV2Engine (SavingEngine):
+    
     def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
-        # TODO post-processing
-        cv2.imwrite(str(out_path), bitmap.to_numpy())
+        np_array = bitmap.to_numpy()
+        np_array = self.postprocess(np_array, bitmap, page, **postproc_kwargs)
+        cv2.imwrite(str(out_path), np_array)
+    
+    @classmethod
+    def postprocess(cls, image, bitmap, page, invert_lightness, exclude_images):
+        if invert_lightness:
+            # posconv = bitmap.get_posconv(page)
+            assert bitmap.format == pdfium_r.FPDFBitmap_BGR, "Lightness inversion is only implemented for RGB/BGR"
+            if bitmap.rev_byteorder:
+                convert_to = cv2.COLOR_RGB2HLS
+                convert_from = cv2.COLOR_HLS2RGB
+            else:
+                convert_to = cv2.COLOR_BGR2HLS
+                convert_from = cv2.COLOR_HLS2BGR
+            image = cv2.cvtColor(image, convert_to)
+            h, l, s = cv2.split(image)
+            l = ~l
+            image = cv2.merge([h, l, s])
+            image = cv2.cvtColor(image, convert_from)
+        return image
 
 
 def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine, postproc_kwargs):

From 736101d36159833b334790cdc32a662b3b874539 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Jul 2024 15:35:43 +0200
Subject: [PATCH 094/140] Implement opencv image exclusion

Phew, that was tough.
In particular, the argument order for copyTo() was really confusing,
because the C++ signature is (src, dst, mask), whereas the Python
signature is (src, mask, dst).
---
 src/pypdfium2/_cli/render.py | 38 ++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 47b5d5643..fbd1c7395 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -18,8 +18,10 @@
     PIL = None
 try:
     import cv2
+    import numpy as np
 except ImportError:
     cv2 = None
+    np = None
 
 import pypdfium2._helpers as pdfium
 import pypdfium2.internal as pdfium_i
@@ -249,21 +251,21 @@ def _get_linv_lut(cls):
         return PIL.ImageFilter.Color3DLUT.generate(cls.LINV_LUT_SIZE, cls._invert_px_lightness)
     
     @classmethod
-    def postprocess(cls, orig_image, page, posconv, invert_lightness, exclude_images):
-        out_image = orig_image
+    def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images):
+        dst_image = src_image
         if invert_lightness:
-            out_image = out_image.filter(cls._get_linv_lut())
+            dst_image = dst_image.filter(cls._get_linv_lut())
             if exclude_images:
                 # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates
                 image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
                 if len(image_objs) > 0:
-                    mask = PIL.Image.new("1", orig_image.size)
+                    mask = PIL.Image.new("1", src_image.size)
                     draw = PIL.ImageDraw.Draw(mask)
                     for obj in image_objs:
                         qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()]
                         draw.polygon(qpoints, fill=1, outline=1)
-                    out_image.paste(orig_image, mask=mask)
-        return out_image
+                    dst_image.paste(src_image, mask=mask)
+        return dst_image
 
 
 class NumpyCV2Engine (SavingEngine):
@@ -274,9 +276,9 @@ def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
         cv2.imwrite(str(out_path), np_array)
     
     @classmethod
-    def postprocess(cls, image, bitmap, page, invert_lightness, exclude_images):
+    def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
+        dst_image = src_image
         if invert_lightness:
-            # posconv = bitmap.get_posconv(page)
             assert bitmap.format == pdfium_r.FPDFBitmap_BGR, "Lightness inversion is only implemented for RGB/BGR"
             if bitmap.rev_byteorder:
                 convert_to = cv2.COLOR_RGB2HLS
@@ -284,12 +286,22 @@ def postprocess(cls, image, bitmap, page, invert_lightness, exclude_images):
             else:
                 convert_to = cv2.COLOR_BGR2HLS
                 convert_from = cv2.COLOR_HLS2BGR
-            image = cv2.cvtColor(image, convert_to)
-            h, l, s = cv2.split(image)
+            dst_image = cv2.cvtColor(dst_image, convert_to)
+            h, l, s = cv2.split(dst_image)
             l = ~l
-            image = cv2.merge([h, l, s])
-            image = cv2.cvtColor(image, convert_from)
-        return image
+            dst_image = cv2.merge([h, l, s])
+            dst_image = cv2.cvtColor(dst_image, convert_from)
+            if exclude_images:
+                posconv = bitmap.get_posconv(page)
+                image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
+                if len(image_objs) > 0:
+                    mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8)
+                    for obj in image_objs:
+                        qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()]
+                        qpoints = np.array(qpoints, np.int32)
+                        cv2.fillPoly(mask, [qpoints], 1)
+                    cv2.copyTo(src_image, mask=mask, dst=dst_image)
+        return dst_image
 
 
 def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine, postproc_kwargs):

From 822c1b763494e333cf29e8b06cc2048ee53521ce Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Jul 2024 15:42:09 +0200
Subject: [PATCH 095/140] opencv: fill all polygons in one go

---
 src/pypdfium2/_cli/render.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index fbd1c7395..7e12d3b3f 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -296,10 +296,8 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
                 image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
                 if len(image_objs) > 0:
                     mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8)
-                    for obj in image_objs:
-                        qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()]
-                        qpoints = np.array(qpoints, np.int32)
-                        cv2.fillPoly(mask, [qpoints], 1)
+                    polygons = [np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32) for obj in image_objs]
+                    cv2.fillPoly(mask, polygons, 1)
                     cv2.copyTo(src_image, mask=mask, dst=dst_image)
         return dst_image
 

From 2746244996df924b42cf28b16081e3f481f195ed Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Jul 2024 15:43:15 +0200
Subject: [PATCH 096/140] Revert "opencv: fill all polygons in one go"

This did the wrong thing when polygons intersect.
---
 src/pypdfium2/_cli/render.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 7e12d3b3f..a2f954357 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -296,8 +296,9 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
                 image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
                 if len(image_objs) > 0:
                     mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8)
-                    polygons = [np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32) for obj in image_objs]
-                    cv2.fillPoly(mask, polygons, 1)
+                    for obj in image_objs:
+                        qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32)
+                        cv2.fillPoly(mask, [qpoints], 1)
                     cv2.copyTo(src_image, mask=mask, dst=dst_image)
         return dst_image
 

From e68d3da5483af1cada880324123e90f8074b4641 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 11 Jul 2024 15:46:44 +0200
Subject: [PATCH 097/140] Add some line breaks

---
 src/pypdfium2/_cli/render.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index a2f954357..bdffbc5fa 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -278,19 +278,23 @@ def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
     @classmethod
     def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
         dst_image = src_image
+        
         if invert_lightness:
             assert bitmap.format == pdfium_r.FPDFBitmap_BGR, "Lightness inversion is only implemented for RGB/BGR"
+            
             if bitmap.rev_byteorder:
                 convert_to = cv2.COLOR_RGB2HLS
                 convert_from = cv2.COLOR_HLS2RGB
             else:
                 convert_to = cv2.COLOR_BGR2HLS
                 convert_from = cv2.COLOR_HLS2BGR
+            
             dst_image = cv2.cvtColor(dst_image, convert_to)
             h, l, s = cv2.split(dst_image)
             l = ~l
             dst_image = cv2.merge([h, l, s])
             dst_image = cv2.cvtColor(dst_image, convert_from)
+            
             if exclude_images:
                 posconv = bitmap.get_posconv(page)
                 image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
@@ -300,6 +304,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
                         qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32)
                         cv2.fillPoly(mask, [qpoints], 1)
                     cv2.copyTo(src_image, mask=mask, dst=dst_image)
+            
         return dst_image
 
 

From 428e970375b711aa769bd742661a531a55ae0987 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 12 Jul 2024 14:01:00 +0200
Subject: [PATCH 098/140] pil/polygon: don't draw an outline

---
 src/pypdfium2/_cli/render.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index bdffbc5fa..d330829d0 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -263,7 +263,7 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images)
                     draw = PIL.ImageDraw.Draw(mask)
                     for obj in image_objs:
                         qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()]
-                        draw.polygon(qpoints, fill=1, outline=1)
+                        draw.polygon(qpoints, fill=1)
                     dst_image.paste(src_image, mask=mask)
         return dst_image
 

From 2bb67665d8fcd02f5497525fb67adc6799b0c277 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 12 Jul 2024 14:07:47 +0200
Subject: [PATCH 099/140] Add missing mkdir with refbindings (fixes #320)

---
 setupsrc/pypdfium2_setup/packaging_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py
index 099a9ac59..887a6c6b1 100644
--- a/setupsrc/pypdfium2_setup/packaging_base.py
+++ b/setupsrc/pypdfium2_setup/packaging_base.py
@@ -494,6 +494,7 @@ def build_pdfium_bindings(version, headers_dir=None, **kwargs):
         flags_diff = set(kwargs["flags"]).difference(REFBINDINGS_FLAGS)
         if flags_diff:  # == not set(...).issubset(...)
             print(f"Warning: The following requested flags are not available in the reference bindings and will be discarded: {flags_diff}")
+        DataDir_Bindings.mkdir(parents=True, exist_ok=True)
         shutil.copyfile(RefBindingsFile, DataDir_Bindings/BindingsFN)
         write_json(ver_path, dict(version=bindings_ver, flags=REFBINDINGS_FLAGS, run_lds=["."], source="reference"))
         return

From 775fb491e3283ebfdbb4a572285e65ed51f6ac83 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 12 Jul 2024 14:37:12 +0200
Subject: [PATCH 100/140] lightness inversion: expand pixel formats compat

---
 src/pypdfium2/_cli/render.py | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index d330829d0..5151a7d4a 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -12,6 +12,7 @@
 
 try:
     import PIL.Image
+    import PIL.ImageOps
     import PIL.ImageFilter
     import PIL.ImageDraw
 except ImportError:
@@ -254,7 +255,10 @@ def _get_linv_lut(cls):
     def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images):
         dst_image = src_image
         if invert_lightness:
-            dst_image = dst_image.filter(cls._get_linv_lut())
+            if src_image.mode == "L":
+                dst_image = PIL.ImageOps.invert(src_image)
+            else:
+                dst_image = dst_image.filter(cls._get_linv_lut())
             if exclude_images:
                 # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates
                 image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
@@ -280,22 +284,26 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
         dst_image = src_image
         
         if invert_lightness:
-            assert bitmap.format == pdfium_r.FPDFBitmap_BGR, "Lightness inversion is only implemented for RGB/BGR"
             
-            if bitmap.rev_byteorder:
-                convert_to = cv2.COLOR_RGB2HLS
-                convert_from = cv2.COLOR_HLS2RGB
+            if bitmap.format == pdfium_r.FPDFBitmap_Gray:
+                dst_image = 255 - src_image
             else:
-                convert_to = cv2.COLOR_BGR2HLS
-                convert_from = cv2.COLOR_HLS2BGR
-            
-            dst_image = cv2.cvtColor(dst_image, convert_to)
-            h, l, s = cv2.split(dst_image)
-            l = ~l
-            dst_image = cv2.merge([h, l, s])
-            dst_image = cv2.cvtColor(dst_image, convert_from)
+                
+                if bitmap.rev_byteorder:
+                    convert_to = cv2.COLOR_RGB2HLS
+                    convert_from = cv2.COLOR_HLS2RGB
+                else:
+                    convert_to = cv2.COLOR_BGR2HLS
+                    convert_from = cv2.COLOR_HLS2BGR
+                
+                dst_image = cv2.cvtColor(dst_image, convert_to)
+                h, l, s = cv2.split(dst_image)
+                l = ~l
+                dst_image = cv2.merge([h, l, s])
+                dst_image = cv2.cvtColor(dst_image, convert_from)
             
             if exclude_images:
+                assert bitmap.format != pdfium_r.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2"
                 posconv = bitmap.get_posconv(page)
                 image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
                 if len(image_objs) > 0:
@@ -303,7 +311,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
                     for obj in image_objs:
                         qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32)
                         cv2.fillPoly(mask, [qpoints], 1)
-                    cv2.copyTo(src_image, mask=mask, dst=dst_image)
+                    dst_image = cv2.copyTo(src_image, mask=mask, dst=dst_image)
             
         return dst_image
 

From bc42d19161d0e6a7d18a11c39a92d0d18a5f47b5 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 12 Jul 2024 20:58:39 +0200
Subject: [PATCH 101/140] Remove wrong comments

---
 src/pypdfium2/_cli/imgtopdf.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/pypdfium2/_cli/imgtopdf.py b/src/pypdfium2/_cli/imgtopdf.py
index 542d637e2..6c3ec82a6 100644
--- a/src/pypdfium2/_cli/imgtopdf.py
+++ b/src/pypdfium2/_cli/imgtopdf.py
@@ -38,8 +38,6 @@ def main(args):
     # Due to limitations in PDFium's public API, this function may be inefficient/lossy for non-JPEG input.
     # The technically best available open-source tool for image to PDF conversion is probably img2pdf (although its code style can be regarded as displeasing).
     
-    # Development note: We are closing objects explicitly because loading JPEGs non-inline binds file handles to the PDF, which need to be released as soon as possible. Without this, we have already run into "OSError: Too many open files" while testing.
-    
     pdf = pdfium.PdfDocument.new()
     
     for fp in args.images:

From 7694cea59c85f219d6bc947f35c55be304909352 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 13 Jul 2024 13:54:22 +0200
Subject: [PATCH 102/140] [Experimental] Defer imports of optional dependencies

---
 README.md                             |  8 ++--
 src/pypdfium2/_cli/imgtopdf.py        | 10 ++---
 src/pypdfium2/_cli/render.py          | 57 +++++++++++++++------------
 src/pypdfium2/_helpers/bitmap.py      | 16 +++-----
 src/pypdfium2/_helpers/pageobjects.py | 15 +++----
 5 files changed, 54 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index 1c045589f..7933c090b 100644
--- a/README.md
+++ b/README.md
@@ -176,12 +176,14 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct
 
 ### Runtime Dependencies
 
-As of this writing, pypdfium2 does not need any mandatory runtime dependencies apart from Python itself.
+As of this writing, pypdfium2 does not require any mandatory runtime dependencies apart from Python itself.
 
-However, some optional support model features require additional packages:
-* [`Pillow`](https://pillow.readthedocs.io/en/stable/) (module name `PIL`) is a pouplar imaging library for Python. pypdfium2 provides convenience methods to translate between raw bitmap buffers and PIL images.
+However, some optional support model features need additional packages:
+* [`Pillow`](https://pillow.readthedocs.io/en/stable/) (module `PIL`) is a pouplar imaging library for Python. pypdfium2 provides convenience adapters to translate between raw bitmap buffers and PIL images. It also uses PIL for some command-line functionality (e.g. image saving).
 * [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. Similar to `Pillow`, pypdfium2 provides helpers to get a numpy array view of a raw bitmap.
+* [`opencv-python`](https://github.com/opencv/opencv-python) (module `cv2`) is an imaging library built around numpy arrays. It can be used in the rendering CLI to save with pypdfium2's numpy adapter.
 
+pypdfium2 tries to defer imports of optional dependencies to the scopes where they are actually accessed, so there should be no startup overhead if you don't use them.
 
 ### Setup Magic
 
diff --git a/src/pypdfium2/_cli/imgtopdf.py b/src/pypdfium2/_cli/imgtopdf.py
index 6c3ec82a6..2238267ac 100644
--- a/src/pypdfium2/_cli/imgtopdf.py
+++ b/src/pypdfium2/_cli/imgtopdf.py
@@ -6,11 +6,6 @@
 from pathlib import Path
 import pypdfium2._helpers as pdfium
 
-try:
-    import PIL.Image
-except ImportError:
-    PIL = None
-
 
 def attach(parser):
     parser.add_argument(
@@ -34,6 +29,11 @@ def attach(parser):
 
 def main(args):
     
+    try:
+        import PIL.Image
+    except ImportError:
+        PIL = None  # JPEG can be convered without PIL
+    
     # Rudimentary image to PDF conversion (testing / proof of concept)
     # Due to limitations in PDFium's public API, this function may be inefficient/lossy for non-JPEG input.
     # The technically best available open-source tool for image to PDF conversion is probably img2pdf (although its code style can be regarded as displeasing).
diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 5151a7d4a..3cf6ecf20 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -9,20 +9,7 @@
 from pathlib import Path
 import multiprocessing as mp
 import concurrent.futures as ft
-
-try:
-    import PIL.Image
-    import PIL.ImageOps
-    import PIL.ImageFilter
-    import PIL.ImageDraw
-except ImportError:
-    PIL = None
-try:
-    import cv2
-    import numpy as np
-except ImportError:
-    cv2 = None
-    np = None
+from importlib.util import find_spec
 
 import pypdfium2._helpers as pdfium
 import pypdfium2.internal as pdfium_i
@@ -33,6 +20,7 @@
     BooleanOptionalAction,
 )
 
+have_cv2 = find_spec("cv2") is not None
 logger = logging.getLogger(__name__)
 
 
@@ -217,21 +205,31 @@ def attach(parser):
 
 class SavingEngine:
     
-    def __init__(self, path_parts):
+    def __init__(self, path_parts, postproc_kwargs):
         self._path_parts = path_parts
+        self.postproc_kwargs = postproc_kwargs
     
     def _get_path(self, i):
         output_dir, prefix, n_digits, format = self._path_parts
         return output_dir / f"{prefix}{i+1:0{n_digits}d}.{format}"
     
-    def __call__(self, i, bitmap, page, postproc_kwargs):
+    def __call__(self, i, bitmap, page):
         out_path = self._get_path(i)
-        self._saving_hook(out_path, bitmap, page, postproc_kwargs)
+        self._saving_hook(out_path, bitmap, page, self.postproc_kwargs)
         logger.info(f"Wrote page {i+1} as {out_path.name}")
 
 
 class PILEngine (SavingEngine):
     
+    def do_imports(self):
+        if not self.postproc_kwargs["invert_lightness"]:
+            return
+        global PIL
+        import PIL.Image
+        import PIL.ImageOps
+        import PIL.ImageFilter
+        import PIL.ImageDraw
+    
     def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
         posconv = bitmap.get_posconv(page)
         pil_image = bitmap.to_pil()
@@ -274,6 +272,12 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images)
 
 class NumpyCV2Engine (SavingEngine):
     
+    @staticmethod
+    def do_imports():
+        global cv2, np
+        import cv2
+        import numpy as np
+    
     def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
         np_array = bitmap.to_numpy()
         np_array = self.postprocess(np_array, bitmap, page, **postproc_kwargs)
@@ -316,7 +320,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
         return dst_image
 
 
-def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine, postproc_kwargs):
+def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine):
     
     if extra_init:
         extra_init()
@@ -327,15 +331,17 @@ def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, e
     if may_init_forms:
         pdf.init_forms()
     
+    engine.do_imports()
+    
     global ProcObjs
-    ProcObjs = (pdf, kwargs, engine, postproc_kwargs)
+    ProcObjs = (pdf, kwargs, engine)
 
 
-def _render_job(i, pdf, kwargs, engine, postproc_kwargs):
+def _render_job(i, pdf, kwargs, engine):
     # logger.info(f"Started page {i+1} ...")
     page = pdf[i]
     bitmap = page.render(**kwargs)
-    engine(i, bitmap, page, postproc_kwargs)
+    engine(i, bitmap, page)
 
 def _render_parallel_job(i):
     global ProcObjs
@@ -362,7 +368,7 @@ def main(args):
     
     # numpy+cv2 is much faster for PNG, and PIL faster for JPG, but this might simply be due to different encoding defaults
     if args.engine_cls is None:
-        if cv2 != None and args.format == "png":
+        if have_cv2 != None and args.format == "png":
             args.engine_cls = NumpyCV2Engine
         else:
             args.engine_cls = PILEngine
@@ -401,13 +407,14 @@ def main(args):
     
     n_digits = len(str(pdf_len))
     path_parts = (args.output, args.prefix, n_digits, args.format)
-    engine = args.engine_cls(path_parts)
+    engine = args.engine_cls(path_parts, postproc_kwargs)
     
     if len(args.pages) <= args.linear:
         
         logger.info("Linear rendering ...")
+        engine.do_imports()
         for i in args.pages:
-            _render_job(i, pdf, kwargs, engine, postproc_kwargs)
+            _render_job(i, pdf, kwargs, engine)
         
     else:
         
@@ -425,7 +432,7 @@ def main(args):
         extra_init = (setup_logging if args.parallel_strategy in ("spawn", "forkserver") else None)
         pool_kwargs = dict(
             initializer = _render_parallel_init,
-            initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine, postproc_kwargs),
+            initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine),
         )
         
         n_procs = min(args.processes, len(args.pages))
diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index c2f75f519..1d17d1e62 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -11,16 +11,6 @@
 
 logger = logging.getLogger(__name__)
 
-try:
-    import PIL.Image
-except ImportError:
-    PIL = None
-
-try:
-    import numpy
-except ImportError:
-    numpy = None
-
 
 class PdfBitmap (pdfium_i.AutoCloseable):
     """
@@ -215,6 +205,8 @@ def to_numpy(self):
         
         # https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray
         
+        import numpy
+        
         array = numpy.ndarray(
             # layout: row major
             shape = (self.height, self.width, self.n_channels),
@@ -242,6 +234,8 @@ def to_pil(self):
         # https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.frombuffer
         # https://pillow.readthedocs.io/en/stable/handbook/writing-your-own-image-plugin.html#the-raw-decoder
         
+        import PIL.Image
+        
         dest_mode = pdfium_i.BitmapTypeToStrReverse[self.format]
         image = PIL.Image.frombuffer(
             dest_mode,                  # target color format
@@ -300,6 +294,8 @@ def get_posconv(self, page):
 
 def _pil_convert_for_pdfium(pil_image):
     
+    import PIL.Image
+    
     if pil_image.mode == "1":
         pil_image = pil_image.convert("L")
     elif pil_image.mode.startswith("RGB"):
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 5dddd3664..409bbc1b5 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -13,11 +13,6 @@
 from pypdfium2._helpers.matrix import PdfMatrix
 from pypdfium2._helpers.bitmap import PdfBitmap
 
-try:
-    import PIL.Image
-except ImportError:
-    PIL = None
-
 
 class PdfObject (pdfium_i.AutoCloseable):
     """
@@ -384,13 +379,13 @@ class ImageNotExtractableError (Exception):
     pass
 
 
-def _get_pil_mode(colorspace, bpp):
+def _get_pil_mode(cs, bpp):
     # In theory, indexed (palettized) and ICC-based color spaces could be handled as well, but PDFium currently does not provide access to the palette or the ICC profile
-    if colorspace == pdfium_c.FPDF_COLORSPACE_DEVICEGRAY:
+    if cs == pdfium_c.FPDF_COLORSPACE_DEVICEGRAY:
         return "1" if bpp == 1 else "L"
-    elif colorspace == pdfium_c.FPDF_COLORSPACE_DEVICERGB:
+    elif cs == pdfium_c.FPDF_COLORSPACE_DEVICERGB:
         return "RGB"
-    elif colorspace == pdfium_c.FPDF_COLORSPACE_DEVICECMYK:
+    elif cs == pdfium_c.FPDF_COLORSPACE_DEVICECMYK:
         return "CMYK"
     else:
         return None
@@ -398,6 +393,8 @@ def _get_pil_mode(colorspace, bpp):
 
 def _extract_smart(image_obj, fb_format=None):
     
+    import PIL.Image
+    
     try:
         # TODO can we change PdfImage.get_data() to take an mmap, so the data could be written directly into a file rather than an in-memory array?
         data, info = _extract_direct(image_obj)

From d7fc983301ca4d507e6b774a372c8289d152b4ff Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 13 Jul 2024 14:11:51 +0200
Subject: [PATCH 103/140] changelog: add ref to selective lightness inversion

---
 docs/devel/changelog_staging.md | 2 +-
 req/converters.txt              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 090507e1f..b38be9734 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -10,7 +10,7 @@
   * Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). Instead, use `PdfPage.render()` with a loop or process pool.
   * Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
   * `PdfBitmap.from_pil()`: Removed `recopy` param.
-  * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark them" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion.
+  * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark them" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion, as is now implemented in pypdfium2's rendering CLI.
 - Pageobjects
   * Renamed `PdfObject.get_pos()` to `.get_bounds()`.
   * Renamed `PdfImage.get_size()` to `.get_px_size()`.
diff --git a/req/converters.txt b/req/converters.txt
index f1c7e2688..551d15c55 100644
--- a/req/converters.txt
+++ b/req/converters.txt
@@ -1,3 +1,3 @@
-# NOTE In order to use numpy, the rendering CLI further needs `opencv-python`, but we don't currently cover that internally. As the import is guarded, we don't have to require it here.
+# NOTE In order to use numpy, the rendering CLI further needs `opencv-python[-headless]`, but we don't currently cover that internally. As the import is guarded, we don't have to require it here.
 pillow
 numpy

From 78997587ea9df59fa6b1293923e552e1446bdf57 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 13 Jul 2024 17:18:12 +0200
Subject: [PATCH 104/140] Do engine imports in parent process with fork context

---
 src/pypdfium2/_cli/render.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 3cf6ecf20..ef43d66a7 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -320,19 +320,16 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
         return dst_image
 
 
-def _render_parallel_init(extra_init, input, password, may_init_forms, kwargs, engine):
-    
-    if extra_init:
-        extra_init()
+def _render_parallel_init(logging_init, engine_init, input, password, may_init_forms, kwargs, engine):
     
+    logging_init()
     logger.info(f"Initializing data for process {os.getpid()}")
+    engine_init()
     
     pdf = pdfium.PdfDocument(input, password=password, autoclose=True)
     if may_init_forms:
         pdf.init_forms()
     
-    engine.do_imports()
-    
     global ProcObjs
     ProcObjs = (pdf, kwargs, engine)
 
@@ -347,6 +344,8 @@ def _render_parallel_job(i):
     global ProcObjs
     _render_job(i, *ProcObjs)
 
+def _do_nothing(): pass
+
 
 # TODO turn into a python-usable API yielding output paths as they are written
 def main(args):
@@ -429,10 +428,15 @@ def main(args):
         if args.parallel_map:
             map_attr = args.parallel_map
         
-        extra_init = (setup_logging if args.parallel_strategy in ("spawn", "forkserver") else None)
+        if args.parallel_strategy == "fork":
+            logging_init, engine_init = _do_nothing, _do_nothing
+            engine.do_imports()
+        else:
+            logging_init, engine_init = setup_logging, engine.do_imports
+        
         pool_kwargs = dict(
             initializer = _render_parallel_init,
-            initargs = (extra_init, pdf._input, args.password, args.draw_forms, kwargs, engine),
+            initargs = (logging_init, engine_init, pdf._input, args.password, args.draw_forms, kwargs, engine),
         )
         
         n_procs = min(args.processes, len(args.pages))

From 9d715cf74d1f5e4184c35cfea256861af1ddda88 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 13 Jul 2024 19:19:15 +0200
Subject: [PATCH 105/140] Use LazyLoader for deferred top-level imports

This allows us to avoid import statements inside functions, which may
end up being executed repeatedly when those functions are called in loops.

Not changing the import strategy of the renderer CLI, as the
engine.do_imports() approach seems nice.

See also
https://gist.github.com/mara004/6915e904797916b961e9c53b4fc874ec for
prior research on the subject of deferred imports.
---
 README.md                             |  2 +-
 docs/devel/changelog_staging.md       |  1 +
 src/pypdfium2/_cli/imgtopdf.py        | 10 +++------
 src/pypdfium2/_cli/render.py          | 10 +++++----
 src/pypdfium2/_helpers/bitmap.py      | 17 ++++++--------
 src/pypdfium2/_helpers/pageobjects.py |  6 ++---
 src/pypdfium2/_utils.py               | 32 +++++++++++++++++++++++++++
 7 files changed, 53 insertions(+), 25 deletions(-)
 create mode 100644 src/pypdfium2/_utils.py

diff --git a/README.md b/README.md
index 7933c090b..0b6916c37 100644
--- a/README.md
+++ b/README.md
@@ -183,7 +183,7 @@ However, some optional support model features need additional packages:
 * [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. Similar to `Pillow`, pypdfium2 provides helpers to get a numpy array view of a raw bitmap.
 * [`opencv-python`](https://github.com/opencv/opencv-python) (module `cv2`) is an imaging library built around numpy arrays. It can be used in the rendering CLI to save with pypdfium2's numpy adapter.
 
-pypdfium2 tries to defer imports of optional dependencies to the scopes where they are actually accessed, so there should be no startup overhead if you don't use them.
+pypdfium2 tries to defer imports of optional dependencies until they are actually needed, so there should be no startup overhead if you don't use them.
 
 ### Setup Magic
 
diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index b38be9734..7cd089d6c 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -26,6 +26,7 @@
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
 - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
+- Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them.
 - Simplified version impl (no API change expected).
 
 *Project*
diff --git a/src/pypdfium2/_cli/imgtopdf.py b/src/pypdfium2/_cli/imgtopdf.py
index 2238267ac..b43ec09c8 100644
--- a/src/pypdfium2/_cli/imgtopdf.py
+++ b/src/pypdfium2/_cli/imgtopdf.py
@@ -5,7 +5,8 @@
 
 from pathlib import Path
 import pypdfium2._helpers as pdfium
-
+from pypdfium2._utils import deferred_import
+PIL_Image = deferred_import("PIL.Image")
 
 def attach(parser):
     parser.add_argument(
@@ -29,11 +30,6 @@ def attach(parser):
 
 def main(args):
     
-    try:
-        import PIL.Image
-    except ImportError:
-        PIL = None  # JPEG can be convered without PIL
-    
     # Rudimentary image to PDF conversion (testing / proof of concept)
     # Due to limitations in PDFium's public API, this function may be inefficient/lossy for non-JPEG input.
     # The technically best available open-source tool for image to PDF conversion is probably img2pdf (although its code style can be regarded as displeasing).
@@ -48,7 +44,7 @@ def main(args):
         if fp.suffix.lower() in (".jpg", ".jpeg"):
             image_obj.load_jpeg(fp, inline=args.inline)
         else:
-            pil_image = PIL.Image.open(fp)
+            pil_image = PIL_Image.open(fp)
             bitmap = pdfium.PdfBitmap.from_pil(pil_image)
             pil_image.close()
             image_obj.set_bitmap(bitmap)
diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index ef43d66a7..f2778a57a 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -20,6 +20,7 @@
     BooleanOptionalAction,
 )
 
+have_pil = find_spec("PIL") is not None
 have_cv2 = find_spec("cv2") is not None
 logger = logging.getLogger(__name__)
 
@@ -272,11 +273,11 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images)
 
 class NumpyCV2Engine (SavingEngine):
     
-    @staticmethod
-    def do_imports():
+    def do_imports(self):
         global cv2, np
         import cv2
-        import numpy as np
+        if self.postproc_kwargs["exclude_images"]:
+            import numpy as np
     
     def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
         np_array = bitmap.to_numpy()
@@ -367,7 +368,8 @@ def main(args):
     
     # numpy+cv2 is much faster for PNG, and PIL faster for JPG, but this might simply be due to different encoding defaults
     if args.engine_cls is None:
-        if have_cv2 != None and args.format == "png":
+        assert have_pil or have_cv2, "Either pillow or numpy+cv2 must be installed for rendering CLI."
+        if (not have_pil) or (have_cv2 and args.format == "png"):
             args.engine_cls = NumpyCV2Engine
         else:
             args.engine_cls = PILEngine
diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index 1d17d1e62..c6c75fa0c 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -8,6 +8,9 @@
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
 from pypdfium2._helpers.misc import PdfiumError
+from pypdfium2._utils import deferred_import
+numpy = deferred_import("numpy")
+PIL_Image = deferred_import("PIL.Image")
 
 logger = logging.getLogger(__name__)
 
@@ -205,8 +208,6 @@ def to_numpy(self):
         
         # https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray
         
-        import numpy
-        
         array = numpy.ndarray(
             # layout: row major
             shape = (self.height, self.width, self.n_channels),
@@ -234,10 +235,8 @@ def to_pil(self):
         # https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.frombuffer
         # https://pillow.readthedocs.io/en/stable/handbook/writing-your-own-image-plugin.html#the-raw-decoder
         
-        import PIL.Image
-        
         dest_mode = pdfium_i.BitmapTypeToStrReverse[self.format]
-        image = PIL.Image.frombuffer(
+        image = PIL_Image.frombuffer(
             dest_mode,                  # target color format
             (self.width, self.height),  # size
             self.buffer,                # buffer
@@ -294,8 +293,6 @@ def get_posconv(self, page):
 
 def _pil_convert_for_pdfium(pil_image):
     
-    import PIL.Image
-    
     if pil_image.mode == "1":
         pil_image = pil_image.convert("L")
     elif pil_image.mode.startswith("RGB"):
@@ -308,14 +305,14 @@ def _pil_convert_for_pdfium(pil_image):
     # convert RGB(A/X) to BGR(A) for PDFium
     if pil_image.mode == "RGB":
         r, g, b = pil_image.split()
-        pil_image = PIL.Image.merge("RGB", (b, g, r))
+        pil_image = PIL_Image.merge("RGB", (b, g, r))
     elif pil_image.mode == "RGBA":
         r, g, b, a = pil_image.split()
-        pil_image = PIL.Image.merge("RGBA", (b, g, r, a))
+        pil_image = PIL_Image.merge("RGBA", (b, g, r, a))
     elif pil_image.mode == "RGBX":
         # technically the x channel may be unnecessary, but preserve what the caller passes in
         r, g, b, x = pil_image.split()
-        pil_image = PIL.Image.merge("RGBX", (b, g, r, x))
+        pil_image = PIL_Image.merge("RGBX", (b, g, r, x))
     
     return pil_image
 
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 409bbc1b5..084babb60 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -12,6 +12,8 @@
 from pypdfium2._helpers.misc import PdfiumError
 from pypdfium2._helpers.matrix import PdfMatrix
 from pypdfium2._helpers.bitmap import PdfBitmap
+from pypdfium2._utils import deferred_import
+PIL_Image = deferred_import("PIL.Image")
 
 
 class PdfObject (pdfium_i.AutoCloseable):
@@ -393,8 +395,6 @@ def _get_pil_mode(cs, bpp):
 
 def _extract_smart(image_obj, fb_format=None):
     
-    import PIL.Image
-    
     try:
         # TODO can we change PdfImage.get_data() to take an mmap, so the data could be written directly into a file rather than an in-memory array?
         data, info = _extract_direct(image_obj)
@@ -406,7 +406,7 @@ def _extract_smart(image_obj, fb_format=None):
         format = info.format
         if format == "raw":
             metadata = info.metadata
-            pil_image = PIL.Image.frombuffer(
+            pil_image = PIL_Image.frombuffer(
                 info.mode,
                 (metadata.width, metadata.height),
                 image_obj.get_data(decode_simple=True),
diff --git a/src/pypdfium2/_utils.py b/src/pypdfium2/_utils.py
new file mode 100644
index 000000000..a1c459281
--- /dev/null
+++ b/src/pypdfium2/_utils.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
+# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+import sys
+import importlib.util
+
+
+def deferred_import(modpath):
+    
+    # FIXME If modpath points to a submodule, the parent module will be loaded immediately when this function is called. This is a limitation of the find_spec() importlib API used here. However, this may still be useful if the parent is a mere namespace package that does not contain anything expensive, as in the case of PIL.
+    
+    module = sys.modules.get(modpath, None)
+    if module is not None:
+        return module  # shortcut
+    
+    # assuming an optional dependency
+    # returning None will simply let it fail with an AttributeError when attempting to access the module
+    try:
+        spec = importlib.util.find_spec(modpath)
+    except ModuleNotFoundError:
+        return None
+    if spec is None:
+        return None
+    
+    # see https://docs.python.org/3/library/importlib.html#implementing-lazy-imports
+    loader = importlib.util.LazyLoader(spec.loader)
+    spec.loader = loader
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[modpath] = module
+    loader.exec_module(module)
+    
+    return module

From db65e002e183c2d9eb2fe4ea5209f2b5b9906e2d Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 13 Jul 2024 21:44:36 +0200
Subject: [PATCH 106/140] Consistently use unary operator for inversion

---
 src/pypdfium2/_cli/render.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index f2778a57a..e1b278717 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -291,7 +291,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
         if invert_lightness:
             
             if bitmap.format == pdfium_r.FPDFBitmap_Gray:
-                dst_image = 255 - src_image
+                dst_image = ~src_image
             else:
                 
                 if bitmap.rev_byteorder:

From 7803b273c62d7d134a170d27091b186b53ad33f7 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 13 Jul 2024 22:02:35 +0200
Subject: [PATCH 107/140] style

---
 src/pypdfium2/_cli/render.py     | 14 +++++++-------
 src/pypdfium2/_helpers/bitmap.py |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index e1b278717..9df642ae2 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -13,7 +13,7 @@
 
 import pypdfium2._helpers as pdfium
 import pypdfium2.internal as pdfium_i
-import pypdfium2.raw as pdfium_r
+import pypdfium2.raw as pdfium_c
 from pypdfium2._cli._parsers import (
     add_input, get_input,
     setup_logging,
@@ -26,9 +26,9 @@
 
 
 def _bitmap_wrapper_foreign_simple(width, height, format, *args, **kwargs):
-    if format == pdfium_r.FPDFBitmap_BGRx:
+    if format == pdfium_c.FPDFBitmap_BGRx:
         use_alpha = False
-    elif format == pdfium_r.FPDFBitmap_BGRA:
+    elif format == pdfium_c.FPDFBitmap_BGRA:
         use_alpha = True
     else:
         raise RuntimeError(f"Cannot create foreign_simple bitmap with bitmap type {pdfium_i.BitmapTypeToStr[format]}.")
@@ -260,7 +260,7 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images)
                 dst_image = dst_image.filter(cls._get_linv_lut())
             if exclude_images:
                 # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates
-                image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
+                image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1))
                 if len(image_objs) > 0:
                     mask = PIL.Image.new("1", src_image.size)
                     draw = PIL.ImageDraw.Draw(mask)
@@ -290,7 +290,7 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
         
         if invert_lightness:
             
-            if bitmap.format == pdfium_r.FPDFBitmap_Gray:
+            if bitmap.format == pdfium_c.FPDFBitmap_Gray:
                 dst_image = ~src_image
             else:
                 
@@ -308,9 +308,9 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
                 dst_image = cv2.cvtColor(dst_image, convert_from)
             
             if exclude_images:
-                assert bitmap.format != pdfium_r.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2"
+                assert bitmap.format != pdfium_c.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2"
                 posconv = bitmap.get_posconv(page)
-                image_objs = list(page.get_objects([pdfium_r.FPDF_PAGEOBJ_IMAGE], max_depth=1))
+                image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1))
                 if len(image_objs) > 0:
                     mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8)
                     for obj in image_objs:
diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index c6c75fa0c..a67c3c6d3 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -144,7 +144,7 @@ def new_foreign(cls, width, height, format, rev_byteorder=False, force_packed=Fa
         Create a new bitmap using :func:`FPDFBitmap_CreateEx`, with a buffer allocated by PDFium.
         There may be a padding of unused bytes at line end, unless *force_packed=True* is given.
         
-        Note that is encouraged to prefer :meth:`.new_native`.
+        Note that it is recommended to prefer :meth:`.new_native`.
         """
         stride = width * pdfium_i.BitmapTypeToNChannels[format] if force_packed else 0
         raw = pdfium_c.FPDFBitmap_CreateEx(width, height, format, None, stride)
@@ -156,7 +156,7 @@ def new_foreign_simple(cls, width, height, use_alpha, rev_byteorder=False):
         """
         Create a new bitmap using :func:`FPDFBitmap_Create`. The buffer is allocated by PDFium, and supposed to be packed (i. e. no gap of unused bytes between lines).
         
-        Note that it is encouraged to prefer :meth:`.new_native`.
+        Note that it is recommended to prefer :meth:`.new_native`.
         """
         raw = pdfium_c.FPDFBitmap_Create(width, height, use_alpha)
         return cls.from_raw(raw, rev_byteorder)

From b495a1ff65cf7de53070596382c764a2c7c6624f Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sat, 13 Jul 2024 22:10:26 +0200
Subject: [PATCH 108/140] add task

---
 src/pypdfium2/_cli/render.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 9df642ae2..2db39f791 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -259,7 +259,8 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images)
             else:
                 dst_image = dst_image.filter(cls._get_linv_lut())
             if exclude_images:
-                # don't descend into XObjects as I'm not sure how to translate XObject to page coordinates
+                # FIXME Not sure how to translate XObject to page coordinates. pdfium does not seem to provide an API for this, so we're currently unable to descend into XObjects.
+                # FIXME We'd also like to take into account alpha masks, but this may be difficult as long as pdfium does not expose them directly.
                 image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1))
                 if len(image_objs) > 0:
                     mask = PIL.Image.new("1", src_image.size)

From e45150abe8319f86ca1b3af705b86354ae866e91 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 14 Jul 2024 14:14:56 +0200
Subject: [PATCH 109/140] Update some wordings

---
 README.md                       | 37 +++++++++++++--------------------
 docs/devel/changelog_staging.md |  4 ++--
 src/pypdfium2/_cli/render.py    |  4 ++--
 src/pypdfium2/_utils.py         |  2 +-
 src/pypdfium2/version.py        |  6 +++---
 5 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 0b6916c37..2c9f4b1f5 100644
--- a/README.md
+++ b/README.md
@@ -98,8 +98,6 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct
   
   See [Setup Magic](#setup-magic) for details.
   
-  Support for source installs (esp. with self-built/system pdfium) is limited, as their integrity somewhat depends on a correctly acting caller.
-  
   Installing an `sdist` does not implicitly trigger a sourcebuild if no pre-built binary is available. We prefer to let callers decide consciously what to do, and run the build script without pip encapsulation.
   
   Relevant pip options:
@@ -107,6 +105,8 @@ pypdfium2 includes helpers to simplify common use cases, while the raw PDFium/ct
   * `-e`: Install in editable mode, so the installation points to the source tree. This way, changes directly take effect without needing to re-install. Recommended for development.
   * `--no-build-isolation`: Do not isolate setup in a virtual env; use the main env instead. This renders `pyproject.toml [build-system]` inactive, so setup deps must be prepared by caller. Useful to install custom versions of setup deps, or as speedup when installing repeatedly.
   
+  That said, do not expect us to provide much guidance with source installs, or to support the result, as this may be a tricky process, and we can't be sure whether it was done correctly (e.g. ABI safety, ctypesgen version used, etc.).
+  
   [^pdfium_buildsystem]: This means pdfium may not compile on arbitrary hosts. The script is limited to build hosts supported by Google's toolchain. Ideally, we'd need an alternative build system that runs with system packages instead.
 
 
@@ -180,7 +180,7 @@ As of this writing, pypdfium2 does not require any mandatory runtime dependencie
 
 However, some optional support model features need additional packages:
 * [`Pillow`](https://pillow.readthedocs.io/en/stable/) (module `PIL`) is a pouplar imaging library for Python. pypdfium2 provides convenience adapters to translate between raw bitmap buffers and PIL images. It also uses PIL for some command-line functionality (e.g. image saving).
-* [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. Similar to `Pillow`, pypdfium2 provides helpers to get a numpy array view of a raw bitmap.
+* [`NumPy`](https://numpy.org/doc/stable/index.html) is a library for scientific computing. As with `Pillow`, pypdfium2 provides helpers to get a numpy array view of a raw bitmap.
 * [`opencv-python`](https://github.com/opencv/opencv-python) (module `cv2`) is an imaging library built around numpy arrays. It can be used in the rendering CLI to save with pypdfium2's numpy adapter.
 
 pypdfium2 tries to defer imports of optional dependencies until they are actually needed, so there should be no startup overhead if you don't use them.
@@ -649,7 +649,7 @@ Usage should be largely self-explanatory, assuming a minimum of familiarity with
 
 ## Licensing
 
-PDFium and pypdfium2 are available by the terms and conditions of either [`Apache-2.0`](LICENSES/Apache-2.0.txt) or [`BSD-3-Clause`](LICENSES/BSD-3-Clause.txt), at your choice.
+pypdfium2 is available by the terms and conditions of either [`Apache-2.0`](LICENSES/Apache-2.0.txt) or [`BSD-3-Clause`](LICENSES/BSD-3-Clause.txt), at your choice.
 Various other open-source licenses apply to dependencies bundled with PDFium. Verbatim copies of their respective licenses are contained in the file [`LicenseRef-PdfiumThirdParty.txt`](LICENSES/LicenseRef-PdfiumThirdParty.txt), which also has to be shipped with binary redistributions.
 Documentation and examples of pypdfium2 are licensed under [`CC-BY-4.0`](LICENSES/CC-BY-4.0.txt).
 
@@ -657,16 +657,13 @@ pypdfium2 complies with the [reuse standard](https://reuse.software/spec/) by in
 
 To the author's knowledge, pypdfium2 is one of the rare Python libraries that are capable of PDF rendering while not being covered by copyleft licenses (such as the `GPL`).[^liberal_pdf_renderlibs]
 
-As of early 2023, a single developer is author and rightsholder of the code base (apart from a few minor [code contributions](https://github.com/pypdfium2-team/pypdfium2/graphs/contributors)).
-
 [^liberal_pdf_renderlibs]: The only other liberal-licensed PDF rendering libraries known to the author are [`pdf.js`](https://github.com/mozilla/pdf.js/) (JavaScript) and [`Apache PDFBox`](https://github.com/apache/pdfbox) (Java), but python bindings packages don't exist yet or are unsatisfactory. However, we wrote some gists that show it'd be possible in principle: [pdfbox](https://gist.github.com/mara004/51c3216a9eabd3dcbc78a86d877a61dc) (+ [setup](https://gist.github.com/mara004/881d0c5a99b8444fd5d1d21a333b70f8)), [pdfjs](https://gist.github.com/mara004/87276da4f8be31c80c38036c6ab667d7).
 
 
-## Issues
+## Issues / Contributions
 
 While using pypdfium2, you might encounter bugs or missing features.
-In this case, feel free to open an issue or discuss thread. If applicable, include details such as tracebacks, OS and CPU type, as well as the versions of pypdfium2 and used dependencies.
-__However, please note our [response policy](#contributions).__
+In this case, feel free to open an issue or discussion thread. If applicable, include details such as tracebacks, OS and CPU type, as well as the versions of pypdfium2 and used dependencies.
 
 Roadmap:
 * pypdfium2
@@ -679,6 +676,13 @@ Roadmap:
 * [pdfium-binaries](https://github.com/bblanchon/pdfium-binaries/issues): Binary builder.
 * [ctypesgen](https://github.com/ctypesgen/ctypesgen/issues): Bindings generator.
 
+### Response policy
+<!-- Inspired by bluesky's contribution rules: https://github.com/bluesky-social/indigo -->
+
+Given this is a volunteer open-source project, it is possible you may not get a response to your issue, or it may be closed without much feedback. Conversations may be locked if we feel like our attention is getting DDOSed. We may not have time to provide usage support.
+
+The same applies to Pull Requests. We will accept contributions only if we find them suitable. Do not reach out with a strong expectation to get your change merged; it is solely up to the repository owner to decide if and when a PR will be merged, and we are free to silently reject PRs we do not like.
+
 ### Known limitations
 
 #### Risk of unknown object lifetime violations
@@ -704,17 +708,6 @@ Also, while ABI bindings tend to be more convenient, they have some technical dr
 ## Development
 <!-- TODO wheel tags, maintainer access, GitHub peculiarities -->
 
-### Contributions
-<!-- Inspired by bluesky's contribution rules: https://github.com/bluesky-social/indigo -->
-
-> We may accept contributions, but only if our code quality expectations are met.
-
-__Policy__:
-* We may not respond to your issue or PR.
-* We may close an issue or PR without much feedback.
-* We may lock discussions or contributions if our attention is getting DDOSed.
-* We may not provide much usage support.
-
 ### Long lines
 
 The pypdfium2 codebase does not hard wrap long lines.
@@ -877,7 +870,7 @@ Inspired by *wowpng*, the first known proof of concept Python binding to PDFium
 *pypdfium-reboot* then added a script to automate binary deployment and bindings generation to simplify regular updates. However, it was still not platform specific.
 
 pypdfium2 is a full rewrite of *pypdfium-reboot* to build platform-specific wheels and consolidate the setup scripts. Further additions include ...
-* A CI workflow to automatically release new wheels every Tuesday
-* Support models that conveniently wrap the raw PDFium/ctypes API
+* A CI workflow to automatically release new wheels at a defined schedule
+* Convenience support models that wrap the raw PDFium/ctypes API
 * Test code
 * A script to build PDFium from source
diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 7cd089d6c..89053d6e6 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -15,7 +15,7 @@
   * Renamed `PdfObject.get_pos()` to `.get_bounds()`.
   * Renamed `PdfImage.get_size()` to `.get_px_size()`.
   * `PdfImage.extract()`: Removed `fb_render` param because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
-- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest.
+- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest is None` and a dest with unknown mode.
 - `get_text_range()`: Removed implicit translation of default calls to `get_text_bounded()`, as pdfium reverted `FPDFText_GetText()` to UCS-2, which resolves the allocation concern. However, callers are encouraged to explicitly use `get_text_bounded()` for full Unicode support.
 - Removed legacy version flags.
 
@@ -27,7 +27,7 @@
 - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
 - Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them.
-- Simplified version impl (no API change expected).
+- Simplified version classes (no API change expected).
 
 *Project*
 - Merged `tests_old/` back into `tests/`.
diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 2db39f791..b87233f2a 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -259,8 +259,8 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images)
             else:
                 dst_image = dst_image.filter(cls._get_linv_lut())
             if exclude_images:
-                # FIXME Not sure how to translate XObject to page coordinates. pdfium does not seem to provide an API for this, so we're currently unable to descend into XObjects.
-                # FIXME We'd also like to take into account alpha masks, but this may be difficult as long as pdfium does not expose them directly.
+                # FIXME pdfium does not seem to provide APIs to translate XObject to page coordinates, so not sure how to handle images nested in XObjects.
+                # FIXME we'd also like to take alpha masks into account, but this may be difficult as long as pdfium does not expose them directly.
                 image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1))
                 if len(image_objs) > 0:
                     mask = PIL.Image.new("1", src_image.size)
diff --git a/src/pypdfium2/_utils.py b/src/pypdfium2/_utils.py
index a1c459281..d968f1c7f 100644
--- a/src/pypdfium2/_utils.py
+++ b/src/pypdfium2/_utils.py
@@ -7,7 +7,7 @@
 
 def deferred_import(modpath):
     
-    # FIXME If modpath points to a submodule, the parent module will be loaded immediately when this function is called. This is a limitation of the find_spec() importlib API used here. However, this may still be useful if the parent is a mere namespace package that does not contain anything expensive, as in the case of PIL.
+    # FIXME If modpath points to a submodule, the parent module will be loaded immediately when this function is called, which is a limitation of the find_spec() importlib API used here. However, this may still be useful if the parent is a mere namespace package that does not contain anything expensive, as in the case of PIL.
     
     module = sys.modules.get(modpath, None)
     if module is not None:
diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py
index 78006c11b..43550d423 100644
--- a/src/pypdfium2/version.py
+++ b/src/pypdfium2/version.py
@@ -10,7 +10,7 @@
 import pypdfium2_raw
 
 
-class _abc_version:
+class _version_interface:
     
     def __init__(self):
         
@@ -45,7 +45,7 @@ def _craft_desc(self, suffix=[]):
         return desc
 
 
-class _version_pypdfium2 (_abc_version):
+class _version_pypdfium2 (_version_interface):
     
     _FILE = Path(__file__).parent / "version.json"
     _TAG_FIELDS = ("major", "minor", "patch")
@@ -64,7 +64,7 @@ def _hook(self):
             self.desc += "@editable"
 
 
-class _version_pdfium (_abc_version):
+class _version_pdfium (_version_interface):
     
     _FILE = Path(pypdfium2_raw.__file__).parent / "version.json"
     _TAG_FIELDS = ("major", "minor", "build", "patch")

From d3e9a43da4e2542bd66923f262cd0c6824201657 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 16 Jul 2024 14:18:30 +0200
Subject: [PATCH 110/140] readme: slightly update wording in raw api guide

---
 README.md | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 2c9f4b1f5..09aae76de 100644
--- a/README.md
+++ b/README.md
@@ -332,14 +332,14 @@ Here are some examples of using the support model API.
 
 ### Raw PDFium API
 
-While helper classes conveniently wrap the raw PDFium API, it may still be accessed directly and is available in the namespace `pypdfium2.raw`. Lower-level helpers that may aid with using the raw API are provided in `pypdfium2.internal`.
+While helper classes conveniently wrap the raw PDFium API, it may still be accessed directly and is available in the namespace `pypdfium2.raw`. Lower-level utilities that may aid with using the raw API are provided in `pypdfium2.internal`.
 
 ```python
 import pypdfium2.raw as pdfium_c
 import pypdfium2.internal as pdfium_i
 ```
 
-Since PDFium is a large library, many components are not covered by helpers yet. You may seamlessly interact with the raw API while still using helpers where available. When used as ctypes function parameter, helper objects automatically resolve to the underlying raw object (but you may still access it explicitly if desired):
+Since PDFium is a large library, many components are not covered by helpers yet. However, as helpers expose their underlying raw objects, you may seamlessly integrate raw APIs while using helpers where available. When passed as a ctypes function parameter, helpers automatically resolve to the raw object handle (but you may still access it explicitly if desired):
 ```python
 permission_flags = pdfium_c.FPDF_GetDocPermission(pdf.raw)  # explicit
 permission_flags = pdfium_c.FPDF_GetDocPermission(pdf)      # implicit
@@ -347,14 +347,14 @@ permission_flags = pdfium_c.FPDF_GetDocPermission(pdf)      # implicit
 
 For PDFium docs, please look at the comments in its [public header files](https://pdfium.googlesource.com/pdfium/+/refs/heads/main/public/).[^pdfium_docs]
 A large variety of examples on how to interface with the raw API using [`ctypes`](https://docs.python.org/3/library/ctypes.html) is already provided with [support model source code](src/pypdfium2/_helpers).
-Nonetheless, the following guide may be helpful to get started with the raw API, especially for developers who are not familiar with `ctypes` yet.
+Nonetheless, the following guide may be helpful to get started with the raw API, if you are not familiar with `ctypes` yet.
 
 [^pdfium_docs]: Unfortunately, no recent HTML-rendered docs are available for PDFium at the moment.
 
 <!-- TODO write something about weakref.finalize(); add example on creating a C page array -->
 
 * In general, PDFium functions can be called just like normal Python functions.
-  However, parameters may only be passed positionally, i. e. it is not possible to use keyword arguments.
+  However, parameters may only be passed positionally, i.e. it is not possible to use keyword arguments.
   There are no defaults, so you always need to provide a value for each argument.
   ```python
   # arguments: filepath (bytes), password (bytes|None)
@@ -369,12 +369,12 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
       FPDF_LoadDocument.argtypes = [FPDF_STRING, FPDF_BYTESTRING]
       FPDF_LoadDocument.restype = FPDF_DOCUMENT
   ```
-  Python `bytes` are converted to `FPDF_STRING` by ctypes autoconversion.
+  Python `bytes` are converted to `FPDF_STRING` (which is an alias to `POINTER(c_char)`, rps. `char*` in C notation) by ctypes autoconversion.
   When passing a string to a C function, it must always be null-terminated, as the function merely receives a pointer to the first item and then continues to read memory until it finds a null terminator.
   
 [^bindings_decl]: From the auto-generated bindings file. We maintain a reference copy at `autorelease/bindings.py`. Or if you have an editable install, there will also be `src/pypdfium2_raw/bindings.py`.
 
-* While some functions are quite easy to use, things soon get more complex.
+* While some functions are quite easy to use, things may soon get more peculiar.
   First of all, function parameters are not only used for input, but also for output:
   ```python
   # Initialise an integer object (defaults to 0)
@@ -406,7 +406,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
   ```
 
 * For string output parameters, callers needs to provide a sufficiently long, pre-allocated buffer.
-  This may work differently depending on what type the function requires, which encoding is used, whether the number of bytes or characters is returned, and whether space for a null terminator is included or not. Carefully review the documentation for the function in question to fulfill its requirements.
+  This may work differently depending on what type the function requires, which encoding is used, whether the number of bytes or characters is returned, and whether space for a null terminator is included or not. Carefully review the documentation of the function in question to fulfill its requirements.
   
   Example A: Getting the title string of a bookmark.
   ```python
@@ -446,8 +446,8 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
 
 * Not only are there different ways of string output that need to be handled according to the requirements of the function in question.
   String input, too, can work differently depending on encoding and type.
-  We have already discussed `FPDF_LoadDocument()`, which takes a UTF-8 encoded string as `char *`.
-  A different examples is `FPDFText_FindStart()`, which needs a UTF-16LE encoded string, given as `unsigned short *`:
+  We have already discussed `FPDF_LoadDocument()`, which takes a UTF-8 encoded string as `char*`.
+  A different example is `FPDFText_FindStart()`, which needs a UTF-16LE encoded string, given as `unsigned short*`:
   ```python
   # (Assuming `text` is a str and `textpage` an FPDF_TEXTPAGE)
   # Add the null terminator and encode as UTF-16LE
@@ -459,7 +459,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
 
 * Leaving strings, let's suppose you have a C memory buffer allocated by PDFium and wish to read its data.
   PDFium will provide you with a pointer to the first item of the byte array.
-  To access the data, you'll want to re-interpret the pointer using `ctypes.cast()` to encompass the whole array:
+  To access the data, you'll want to re-interpret the pointer with `ctypes.cast()` to encompass the whole array:
   ```python
   # (Assuming `bitmap` is an FPDF_BITMAP and `size` is the expected number of bytes in the buffer)
   buffer_ptr = pdfium_c.FPDFBitmap_GetBuffer(bitmap)
@@ -480,7 +480,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
   n_bytes = py_buffer.readinto(buffer_ptr.contents)  # returns the number of bytes read
   ```
 
-* If you wish to check whether two objects returned by PDFium are the same, the `is` operator won't help because `ctypes` does not have original object return (OOR), i. e. new, equivalent Python objects are created each time, although they might represent one and the same C object.[^ctypes_no_oor]
+* If you wish to check whether two objects returned by PDFium are the same, the `is` operator won't help because `ctypes` does not have original object return (OOR), i.e. new, equivalent Python objects are created each time, although they might represent one and the same C object.[^ctypes_no_oor]
   That's why you'll want to use `ctypes.addressof()` to get the memory addresses of the underlying C object.
   For instance, this is used to avoid infinite loops on circular bookmark references when iterating through the document outline:
   ```python
@@ -504,7 +504,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
   
   [^callback_usecases]: e. g. incremental read/write, management of progressive tasks, ...
   
-  Example: Loading a document from a Python buffer. This way, file access can be controlled in Python while the whole data does not need to be in memory at once.
+  Example: Loading a document from a Python buffer. This way, file access can be controlled in Python while the data does not need to be in memory at once.
   ```python
   import os
   
@@ -542,7 +542,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
 
 * When using the raw API, special care needs to be taken regarding object lifetime, considering that Python may garbage collect objects as soon as their reference count reaches zero. However, the interpreter has no way of magically knowing how long the underlying resources of a Python object might still be needed on the C side, so measures need to be taken to keep such objects referenced until PDFium does not depend on them anymore.
   
-  If resources need to remain valid after the time of a function call, PDFium docs usually indicate this clearly. Ignoring requirements on object lifetime will lead to memory corruption (commonly resulting in a segfault).
+  If resources need to remain valid after the time of a function call, PDFium docs usually indicate this clearly. Ignoring requirements on object lifetime will lead to memory corruption (commonly resulting in a segfault sooner or later).
   
   For instance, the docs on `FPDF_LoadCustomDocument()` state that
   > The application must keep the file resources |pFileAccess| points to valid until the returned FPDF_DOCUMENT is closed. |pFileAccess| itself does not need to outlive the FPDF_DOCUMENT.

From 86bc8b19173eb179a60d8b9d5a85bf5dbb5d4b8d Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Tue, 16 Jul 2024 14:38:52 +0200
Subject: [PATCH 111/140] Add reference to VikParuchuri's `pdftext`

---
 src/pypdfium2/_helpers/textpage.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
index 19cdb4ac6..64e4b33cb 100644
--- a/src/pypdfium2/_helpers/textpage.py
+++ b/src/pypdfium2/_helpers/textpage.py
@@ -19,6 +19,11 @@ class PdfTextPage (pdfium_i.AutoCloseable):
     """
     Text page helper class.
     
+    Hint:
+        (py)pdfium itself does not implement layout analysis, such as detecting words/lines/paragraphs.
+        However, there is a fancy third-party extension to pypdfium2 that fills this gap:
+        https://github.com/VikParuchuri/pdftext
+    
     Attributes:
         raw (FPDF_TEXTPAGE):
             The underlying PDFium textpage handle.

From f0dbf9c0bf5c729ccab7852227149e53e77d2691 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 21 Jul 2024 14:02:16 +0200
Subject: [PATCH 112/140] version: clean up trailer

---
 src/pypdfium2/version.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py
index 43550d423..860aae695 100644
--- a/src/pypdfium2/version.py
+++ b/src/pypdfium2/version.py
@@ -165,5 +165,3 @@ def _hook(self):
     flags (tuple[str]):
         Tuple of pdfium feature flags. Empty for default build. (V8, XFA) for pdfium-binaries V8 build.
 """
-
-# -----

From f33fa366ecf6792bbe969f2bafeed580fee7e64b Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 21 Jul 2024 18:10:55 +0200
Subject: [PATCH 113/140] readme: improve raw api

Avoid declaring UTF-16 as "2 bytes per character", because a visual
character could be composed of a surrogate pair of 4 bytes.
So this is not the number of visual characters, but the number of units,
where the number of bytes per unit corresponds to the size of the data
type used.
---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 09aae76de..0af837329 100644
--- a/README.md
+++ b/README.md
@@ -369,7 +369,7 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
       FPDF_LoadDocument.argtypes = [FPDF_STRING, FPDF_BYTESTRING]
       FPDF_LoadDocument.restype = FPDF_DOCUMENT
   ```
-  Python `bytes` are converted to `FPDF_STRING` (which is an alias to `POINTER(c_char)`, rps. `char*` in C notation) by ctypes autoconversion.
+  Python `bytes` are converted to `FPDF_STRING` by ctypes autoconversion. This works because `FPDF_STRING` is actually an alias to `POINTER(c_char)` (i.e. `char*`), which is a primitive pointer type.
   When passing a string to a C function, it must always be null-terminated, as the function merely receives a pointer to the first item and then continues to read memory until it finds a null terminator.
   
 [^bindings_decl]: From the auto-generated bindings file. We maintain a reference copy at `autorelease/bindings.py`. Or if you have an editable install, there will also be `src/pypdfium2_raw/bindings.py`.
@@ -411,14 +411,13 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
   Example A: Getting the title string of a bookmark.
   ```python
   # (Assuming `bookmark` is an FPDF_BOOKMARK)
-  # First call to get the required number of bytes (not characters!), including space for a null terminator
+  # First call to get the required number of bytes (not units!), including space for a null terminator
   n_bytes = pdfium_c.FPDFBookmark_GetTitle(bookmark, None, 0)
   # Initialise the output buffer
   buffer = ctypes.create_string_buffer(n_bytes)
   # Second call with the actual buffer
   pdfium_c.FPDFBookmark_GetTitle(bookmark, buffer, n_bytes)
-  # Decode to string, cutting off the null terminator
-  # Encoding: UTF-16LE (2 bytes per character)
+  # Decode to string, cutting off the null terminator (encoding: UTF-16LE)
   title = buffer.raw[:n_bytes-2].decode("utf-16-le")
   ```
   
@@ -427,16 +426,17 @@ Nonetheless, the following guide may be helpful to get started with the raw API,
   # (Assuming `textpage` is an FPDF_TEXTPAGE and the boundary variables are set)
   # Store common arguments for the two calls
   args = (textpage, left, top, right, bottom)
-  # First call to get the required number of characters (not bytes!) - a possible null terminator is not included
+  # First call to get the required number of units (not bytes!) - a possible null terminator is not included
   n_chars = pdfium_c.FPDFText_GetBoundedText(*args, None, 0)
   # If no characters were found, return an empty string
   if n_chars <= 0:
       return ""
-  # Calculate the required number of bytes (UTF-16LE encoding again)
+  # Calculate the required number of bytes (encoding: UTF-16LE again)
+  # The function signature uses c_ushort, so 1 unit takes sizeof(c_ushort) == 2 bytes
   n_bytes = 2 * n_chars
   # Initialise the output buffer - this function can work without null terminator, so skip it
   buffer = ctypes.create_string_buffer(n_bytes)
-  # Re-interpret the type from char to unsigned short as required by the function
+  # Re-interpret the pointer type from char* to unsigned short* as required by the function
   buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
   # Second call with the actual buffer
   pdfium_c.FPDFText_GetBoundedText(*args, buffer_ptr, n_chars)

From c2aa668d5252ea911b4350d5bb81bae809fd1325 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 21 Jul 2024 18:32:34 +0200
Subject: [PATCH 114/140] Update a few docstrings

---
 src/pypdfium2/_helpers/document.py | 12 ++++++------
 src/pypdfium2/_helpers/page.py     |  2 +-
 src/pypdfium2/_helpers/textpage.py |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index abff23207..6511c2f96 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -317,7 +317,7 @@ def count_attachments(self):
     def get_attachment(self, index):
         """
         Returns:
-            PdfAttachment: The attachment at *index* (zero-based).
+            PdfAttachment: The attachment at given index (zero-based).
         """
         raw_attachment = pdfium_c.FPDFDoc_GetAttachment(self, index)
         if not raw_attachment:
@@ -345,7 +345,7 @@ def new_attachment(self, name):
     
     def del_attachment(self, index):
         """
-        Unlink the attachment at *index* (zero-based).
+        Unlink the attachment at given index (zero-based).
         It will be hidden from the viewer, but is still present in the file (as of PDFium 5418).
         Following attachments shift one slot to the left in the array representation used by PDFium's API.
         
@@ -360,7 +360,7 @@ def del_attachment(self, index):
     def get_page(self, index):
         """
         Returns:
-            PdfPage: The page at *index* (zero-based).
+            PdfPage: The page at given index (zero-based).
         Note:
             This calls ``FORM_OnAfterLoadPage()`` if the document has an active form env.
             In that case, note that closing the formenv would implicitly close the page.
@@ -406,7 +406,7 @@ def new_page(self, width, height, index=None):
     
     def del_page(self, index):
         """
-        Remove the page at *index* (zero-based).
+        Remove the page at given index (zero-based).
         It is recommended to close any open handles to the page before calling this method.
         """
         # FIXME not sure how pdfium would behave if the caller tries to access a handle to a deleted page...
@@ -447,7 +447,7 @@ def import_pages(self, pdf, pages=None, index=None):
     def get_page_size(self, index):
         """
         Returns:
-            (float, float): Width and height in PDF canvas units of the page at *index* (zero-based).
+            (float, float): Width and height of the page at given index (zero-based), in PDF canvas units.
         """
         size = pdfium_c.FS_SIZEF()
         ok = pdfium_c.FPDF_GetPageSizeByIndexF(self, index, size)
@@ -459,7 +459,7 @@ def get_page_size(self, index):
     def get_page_label(self, index):
         """
         Returns:
-            str: Label of the page at *index* (zero-based).
+            str: Label of the page at given index (zero-based).
             (A page label is essentially an alias that may be displayed instead of the page number.)
         """
         n_bytes = pdfium_c.FPDF_GetPageLabel(self, index, None, 0)
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index f992c723c..59342af44 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -202,7 +202,7 @@ def insert_obj(self, pageobj):
         """
         Insert a pageobject into the page.
         
-        The pageobject must not belong to a page yet. If it belongs to a PDF, this page must be part of the PDF.
+        The pageobject must not belong to a page yet. If it belongs to a PDF, the target page must be part of that PDF.
         
         Position and form are defined by the object's matrix.
         If it is the identity matrix, the object will appear as-is on the bottom left corner of the page.
diff --git a/src/pypdfium2/_helpers/textpage.py b/src/pypdfium2/_helpers/textpage.py
index 64e4b33cb..22dbc36a8 100644
--- a/src/pypdfium2/_helpers/textpage.py
+++ b/src/pypdfium2/_helpers/textpage.py
@@ -21,7 +21,7 @@ class PdfTextPage (pdfium_i.AutoCloseable):
     
     Hint:
         (py)pdfium itself does not implement layout analysis, such as detecting words/lines/paragraphs.
-        However, there is a fancy third-party extension to pypdfium2 that fills this gap:
+        However, there is a fancy third-party extension that fills this gap:
         https://github.com/VikParuchuri/pdftext
     
     Attributes:
@@ -43,7 +43,7 @@ def parent(self):  # AutoCloseable hook
     
     def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"):
         """
-        Extract text from given boundaries in PDF coordinates.
+        Extract text from given boundaries, in PDF canvas units.
         If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
         
         Parameters:

From eb8b1b523545b61fdd1bf1ec5adc33fbb2eb59c9 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 21 Jul 2024 19:02:19 +0200
Subject: [PATCH 115/140] Rename "byte buffer" to "byte stream"

---
 src/pypdfium2/_helpers/document.py    | 8 ++++----
 src/pypdfium2/_helpers/pageobjects.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index 6511c2f96..a7b775e57 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -24,13 +24,13 @@ class PdfDocument (pdfium_i.AutoCloseable):
     
     Parameters:
         input_data (str | pathlib.Path | bytes | ctypes.Array | typing.BinaryIO | FPDF_DOCUMENT):
-            The input PDF given as file path, bytes, ctypes array, byte buffer, or raw PDFium document handle.
-            A byte buffer is defined as an object that implements ``seek() tell() read() readinto()``.
+            The input PDF given as file path, bytes, ctypes array, byte stream, or raw PDFium document handle.
+            A byte stream is defined as an object that implements ``seek() tell() read() readinto()``.
         password (str | None):
             A password to unlock the PDF, if encrypted. Otherwise, None or an empty string may be passed.
             If a password is given but the PDF is not encrypted, it will be ignored (as of PDFium 5418).
         autoclose (bool):
-            Whether byte buffer input should be automatically closed on finalization.
+            Whether byte stream input should be automatically closed on finalization.
     
     Raises:
         PdfiumError: Raised if the document failed to load. The exception is annotated with the reason reported by PDFium (via message and :attr:`~.PdfiumError.err_code`).
@@ -219,7 +219,7 @@ def save(self, dest, version=None, flags=pdfium_c.FPDF_NO_INCREMENTAL):
         
         Parameters:
             dest (str | pathlib.Path | io.BytesIO):
-                File path or byte buffer the document shall be written to.
+                File path or byte stream the document shall be written to.
             version (int | None):
                 The PDF version to use, given as an integer (14 for 1.4, 15 for 1.5, ...).
                 If None (the default), PDFium will set a version automatically.
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 084babb60..01bf7768b 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -212,7 +212,7 @@ def load_jpeg(self, source, pages=None, inline=False, autoclose=True):
         
         Parameters:
             source (str | pathlib.Path | typing.BinaryIO):
-                Input JPEG, given as file path or readable byte buffer.
+                Input JPEG, given as file path or readable byte stream.
             pages (list[PdfPage] | None):
                 If replacing an image, pass in a list of loaded pages that might contain it, to update their cache.
                 (The same image may be shown multiple times in different transforms across a PDF.)
@@ -230,7 +230,7 @@ def load_jpeg(self, source, pages=None, inline=False, autoclose=True):
         elif pdfium_i.is_buffer(source, "r"):
             buffer = source
         else:
-            raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte buffer.")
+            raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte stream.")
         
         bufaccess, to_hold = pdfium_i.get_bufreader(buffer)
         loader = {
@@ -341,7 +341,7 @@ def get_filters(self, skip_simple=False):
     
     def extract(self, dest, *args, **kwargs):
         """
-        Extract the image into an independently usable file or byte buffer, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits.
+        Extract the image into an independently usable file or byte stream, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits.
         
         This method can only extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly.
         Otherwise, the pixel data is decoded and re-encoded using :mod:`PIL`, which is slower and loses the original encoding.
@@ -355,7 +355,7 @@ def extract(self, dest, *args, **kwargs):
         
         Parameters:
             dest (str | pathlib.Path | io.BytesIO):
-                File path prefix or byte buffer to which the image shall be written.
+                File path prefix or byte stream to which the image shall be written.
             fb_format (str):
                 The image format to use in case it is necessary to (re-)encode the data.
         """

From d29435db98d2ea049688d81eea0c23f155b7060d Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 24 Jul 2024 17:25:05 +0200
Subject: [PATCH 116/140] doc nits

The TODO is pointless because closing a pageobject that is part of a
page would be a no-op, as these are managed by pdfium anyway.
---
 src/pypdfium2/_helpers/bitmap.py   | 2 +-
 src/pypdfium2/_helpers/document.py | 6 +++---
 src/pypdfium2/_helpers/page.py     | 2 --
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index a67c3c6d3..10e5389e1 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -23,7 +23,7 @@ class PdfBitmap (pdfium_i.AutoCloseable):
     
     Warning:
         ``bitmap.close()``, which frees the buffer of foreign bitmaps, is not validated for safety.
-        A bitmap must not be closed when other objects still depend on its buffer!
+        A bitmap must not be closed while other objects still depend on its buffer!
     
     Attributes:
         raw (FPDF_BITMAP):
diff --git a/src/pypdfium2/_helpers/document.py b/src/pypdfium2/_helpers/document.py
index a7b775e57..4805f7a05 100644
--- a/src/pypdfium2/_helpers/document.py
+++ b/src/pypdfium2/_helpers/document.py
@@ -38,7 +38,7 @@ class PdfDocument (pdfium_i.AutoCloseable):
     
     Hint:
         * Documents may be used in a ``with``-block, closing the document on context manager exit.
-          This is recommended when *input_data* is a file path, to safely and immediately release the opened file handle.
+          This is recommended when *input_data* is a file path, to safely and immediately release the bound file handle.
         * :func:`len` may be called to get a document's number of pages.
         * Pages may be loaded using list index access.
         * Looping over a document will yield its pages from beginning to end.
@@ -608,8 +608,8 @@ def as_pageobject(self):
         """
         Returns:
             PdfObject: An independent pageobject representation of the XObject.
-            If multiple pageobjects are created from one XObject, they share resources.
-            Pageobjects created from an XObject remain valid after the XObject is closed.
+            If multiple pageobjects are created from an XObject, they share resources.
+            Returned pageobjects remain valid after the XObject is closed.
         """
         raw_pageobj = pdfium_c.FPDF_NewFormObjectFromXObject(self)
         # not a child object (see above)
diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index 59342af44..cd451a67d 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -281,8 +281,6 @@ def get_objects(self, filter=None, max_depth=15, form=None, level=0):
             :class:`.PdfObject`: A pageobject.
         """
         
-        # TODO close skipped objects explicitly ?
-        
         if form:
             count_objects = pdfium_c.FPDFFormObj_CountObjects
             get_object = pdfium_c.FPDFFormObj_GetObject

From f15ac1b5b80a648cfff8cfbff29805c0998b9706 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 25 Jul 2024 14:09:49 +0200
Subject: [PATCH 117/140] fix typo

---
 docs/devel/changelog_staging.md | 2 +-
 src/pypdfium2/version.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 89053d6e6..6e2352600 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -10,7 +10,7 @@
   * Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). Instead, use `PdfPage.render()` with a loop or process pool.
   * Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
   * `PdfBitmap.from_pil()`: Removed `recopy` param.
-  * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark them" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion, as is now implemented in pypdfium2's rendering CLI.
+  * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark theme" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion, as is now implemented in pypdfium2's rendering CLI.
 - Pageobjects
   * Renamed `PdfObject.get_pos()` to `.get_bounds()`.
   * Renamed `PdfImage.get_size()` to `.get_px_size()`.
diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py
index 860aae695..ec280f8dc 100644
--- a/src/pypdfium2/version.py
+++ b/src/pypdfium2/version.py
@@ -55,7 +55,7 @@ def _hook(self):
         self.tag = self._craft_tag()
         if self.beta is not None:
             self.tag += f"b{self.beta}"
-    
+        
         suffix = ["dirty"] if self.dirty else []
         self.desc = self._craft_desc(suffix)
         if self.data_source != "git":

From 4cda54c8c7d443bf1343832fc33bd9784f8a2056 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 1 Aug 2024 17:54:29 +0200
Subject: [PATCH 118/140] Update to new FPDFPageObj_TransformF()

https://pdfium-review.googlesource.com/c/pdfium/+/121630
---
 src/pypdfium2/_helpers/pageobjects.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 01bf7768b..7565b1598 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -145,7 +145,9 @@ def transform(self, matrix):
         Parameters:
             matrix (PdfMatrix): Multiply the pageobject's current transform matrix by this matrix.
         """
-        pdfium_c.FPDFPageObj_Transform(self, *matrix.get())
+        ok = pdfium_c.FPDFPageObj_TransformF(self, matrix)
+        if not ok:
+            raise PdfiumError("Failed to transform pageobject with matrix.")
 
 
 class PdfImage (PdfObject):

From 7cc3cbe2b16961976879b344126448f0f2d63253 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 1 Aug 2024 18:11:48 +0200
Subject: [PATCH 119/140] Fix caller-side imports of deferred modules

---
 src/pypdfium2/_utils.py | 49 +++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/src/pypdfium2/_utils.py b/src/pypdfium2/_utils.py
index d968f1c7f..48dd1099c 100644
--- a/src/pypdfium2/_utils.py
+++ b/src/pypdfium2/_utils.py
@@ -1,32 +1,39 @@
 # SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
 # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
 
+# see https://gist.github.com/mara004/6915e904797916b961e9c53b4fc874ec for alternative approaches to deferred imports
+
 import sys
-import importlib.util
+import importlib
+import functools
 
+if sys.version_info < (3, 8):
+    # NOTE This is not as good as a real cached property.
+    # https://github.com/penguinolog/backports.cached_property might be better.
+    def cached_property(func):
+        return property( functools.lru_cache(maxsize=1)(func) )
+else:
+    cached_property = functools.cached_property
 
-def deferred_import(modpath):
+
+class _DeferredModule:
     
-    # FIXME If modpath points to a submodule, the parent module will be loaded immediately when this function is called, which is a limitation of the find_spec() importlib API used here. However, this may still be useful if the parent is a mere namespace package that does not contain anything expensive, as in the case of PIL.
+    # NOTE Attribute assignment will affect only the wrapper, not the actual module.
     
-    module = sys.modules.get(modpath, None)
-    if module is not None:
-        return module  # shortcut
+    def __init__(self, modpath):
+        self._modpath = modpath
     
-    # assuming an optional dependency
-    # returning None will simply let it fail with an AttributeError when attempting to access the module
-    try:
-        spec = importlib.util.find_spec(modpath)
-    except ModuleNotFoundError:
-        return None
-    if spec is None:
-        return None
+    def __repr__(self):
+        return f"<deferred module wrapper {self._modpath!r}>"
     
-    # see https://docs.python.org/3/library/importlib.html#implementing-lazy-imports
-    loader = importlib.util.LazyLoader(spec.loader)
-    spec.loader = loader
-    module = importlib.util.module_from_spec(spec)
-    sys.modules[modpath] = module
-    loader.exec_module(module)
+    @cached_property
+    def _module(self):
+        # print("actually importing module...")
+        return importlib.import_module(self._modpath)
     
-    return module
+    def __getattr__(self, k):
+        return getattr(self._module, k)
+
+
+def deferred_import(modpath):
+    return _DeferredModule(modpath)

From bbc7f98b01f1306794502c0251fc2960dcf0926e Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 11 Aug 2024 21:26:56 +0200
Subject: [PATCH 120/140] `PdfMatrix.mirror()`: Fix misleading terminology

see changelog entry
---
 docs/devel/changelog_staging.md  |  1 +
 src/pypdfium2/_helpers/matrix.py | 10 ++++++----
 tests/test_nup.py                |  6 +++---
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 6e2352600..16b5b14d4 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -15,6 +15,7 @@
   * Renamed `PdfObject.get_pos()` to `.get_bounds()`.
   * Renamed `PdfImage.get_size()` to `.get_px_size()`.
   * `PdfImage.extract()`: Removed `fb_render` param because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place.
+- Renamed misleading `PdfMatrix.mirror()` parameters `v, h` to `invert_x, invert_y`, as the terms horizontal/vertical flip commonly refer to the transformation applied, not the axis around which the content is being flipped (i.e. the previous `v` meant flipping around the Y axis, which is vertical, but the resulting transform is inverting the X coordinates and thus actually horizontal). No behavior change if you did not use keyword arguments.
 - `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also distinguishes between `dest is None` and a dest with unknown mode.
 - `get_text_range()`: Removed implicit translation of default calls to `get_text_bounded()`, as pdfium reverted `FPDFText_GetText()` to UCS-2, which resolves the allocation concern. However, callers are encouraged to explicitly use `get_text_bounded()` for full Unicode support.
 - Removed legacy version flags.
diff --git a/src/pypdfium2/_helpers/matrix.py b/src/pypdfium2/_helpers/matrix.py
index 9ba1de292..935a5784d 100644
--- a/src/pypdfium2/_helpers/matrix.py
+++ b/src/pypdfium2/_helpers/matrix.py
@@ -127,13 +127,15 @@ def rotate(self, angle, ccw=False, rad=False):
         return self.multiply( PdfMatrix(c, s, -s, c) if ccw else PdfMatrix(c, -s, s, c) )
     
     
-    def mirror(self, v, h):
+    def mirror(self, invert_x, invert_y):
         """
         Parameters:
-            v (bool): Whether to mirror vertically (at the Y axis).
-            h (bool): Whether to mirror horizontally (at the X axis).
+            invert_x (bool): If True, invert X coordinates (horizontal transform). Corresponds to flipping around the Y axis.
+            invert_y (bool): If True, invert Y coordinates (vertical transform). Corresponds to flipping around the X axis.
+        Note:
+            Flipping around a vertical axis leads to a horizontal transform, and vice versa.
         """
-        return self.scale(x=(-1 if v else 1), y=(-1 if h else 1))
+        return self.scale(x=(-1 if invert_x else 1), y=(-1 if invert_y else 1))
     
     
     def skew(self, x_angle, y_angle, rad=False):
diff --git a/tests/test_nup.py b/tests/test_nup.py
index 5a20f1b91..270bb2908 100644
--- a/tests/test_nup.py
+++ b/tests/test_nup.py
@@ -37,21 +37,21 @@ def test_xobject_placement():
     assert pytest.approx(pos_a, abs=0.5) == (19, 440, 279, 823)
     
     po = xobject.as_pageobject()
-    matrix = base_matrix.mirror(v=True, h=False).translate(w, 0).translate(w, h)
+    matrix = base_matrix.mirror(invert_x=True, invert_y=False).translate(w, 0).translate(w, h)
     assert matrix == pdfium.PdfMatrix(-0.5, 0, 0, 0.5, 2*w, h)
     po.transform(matrix)
     dest_page_1.insert_obj(po)
     
     po = xobject.as_pageobject()
     assert po.get_matrix() == pdfium.PdfMatrix()
-    matrix = base_matrix.mirror(v=False, h=True).translate(0, h).translate(w, 0)
+    matrix = base_matrix.mirror(invert_x=False, invert_y=True).translate(0, h).translate(w, 0)
     assert matrix == pdfium.PdfMatrix(0.5, 0, 0, -0.5, w, h)
     po.set_matrix(matrix)
     assert po.get_matrix() == matrix
     dest_page_1.insert_obj(po)
     
     po = xobject.as_pageobject()
-    matrix = base_matrix.mirror(v=True, h=True).translate(w, h)
+    matrix = base_matrix.mirror(invert_x=True, invert_y=True).translate(w, h)
     assert matrix == pdfium.PdfMatrix(-0.5, 0, 0, -0.5, w, h)
     po.set_matrix(matrix)
     dest_page_1.insert_obj(po)

From 98ed5365934096d817ed61b7d63893e8bad43d43 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 11 Aug 2024 21:32:59 +0200
Subject: [PATCH 121/140] changelog: explicitly mention previous `_flatten()`

---
 docs/devel/changelog_staging.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index 16b5b14d4..a59a974ef 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -23,7 +23,7 @@
 *Improvements and new features*
 - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
-- Exposed `PdfPage.flatten()` (previously non-public helper), after having found out how to correctly use it. Added check and updated docs accordingly.
+- Exposed `PdfPage.flatten()` (previously semi-private `_flatten()`), after having found out how to correctly use it. Added check and updated docs accordingly.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
 - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.

From ee2f03593a90db713d710dd47ed520b211b1c038 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 26 Aug 2024 21:44:46 +0200
Subject: [PATCH 122/140] changelog nit

---
 docs/devel/changelog_staging.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index a59a974ef..f77d83a7b 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -24,7 +24,7 @@
 - Added `PdfPosConv` helper and `PdfBitmap.get_posconv(page)` for bidirectional translation between page and bitmap coordinates.
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
 - Exposed `PdfPage.flatten()` (previously semi-private `_flatten()`), after having found out how to correctly use it. Added check and updated docs accordingly.
-- Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs.
+- Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released explicitly, given OS limits on the number of open FDs.
 - If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
 - Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them.

From b3f78041f0e3f57e5152941a379f7bf6068ced22 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 19 Sep 2024 02:22:46 +0200
Subject: [PATCH 123/140] Update licensing docs

It is not clear to me if PDFium is "BSD-3-Clause OR Apache-2.0" or
"BSD-3-Clause AND Apache-2.0". The pypdfium2 codebase previously stated
"OR", but recently it hit me we don't actually have any evidence for
that.
In the end, I figured it was probably a presumption from the early days
of the project that might as well be wrong, and that "BSD-3-Clause AND
Apache-2.0" would have been the safer assumption. Sorry :(

IANAL, but to my understanding both licenses are liberal and in similar
spirit, so hopefully this should not have negative legal consequences
downstream.
Note that there is (and always was) ABSOLUTELY NO WARRANTY for any
information provided with the pypdfium2 project. For pypdfium2's Readme,
see the CC-BY-4.0 license (e.g. "Section 5 -- Disclaimer of Warranties
and Limitation of Liability."). For pypdfium2's code (including any
information provided therein), see the Apache-2.0 or BSD-3-Clause
licenses, which have similar disclaimers.

This patch avoids any "OR" or "AND", instead changing to a generic
comma. This is not valid SPDX/reuse syntax and serves as a placeholder
until we know better.

Note that pypdfium2's Python code continues to be "Apache-2.0 OR
BSD-3-Clause". This issue is only about PDFium itself.
---
 .reuse/dep5                    |  4 ++--
 .reuse/dep5-wheel              |  2 +-
 README.md                      | 12 +++++++++---
 conda/helpers/recipe/meta.yaml |  4 ++--
 conda/raw/recipe/meta.yaml     |  4 ++--
 setup.py                       |  3 +--
 6 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/.reuse/dep5 b/.reuse/dep5
index 552d0b4af..8caad1951 100644
--- a/.reuse/dep5
+++ b/.reuse/dep5
@@ -51,7 +51,7 @@ Files:
     tests/resources/attachments.pdf
     tests/resources/mona_lisa.jpg
 Copyright: 2022 PDFium developers
-License: BSD-3-Clause OR Apache-2.0
+License: BSD-3-Clause, Apache-2.0
 Comment:
     Obtained from:
     https://pdfium.googlesource.com/pdfium/+/refs/heads/main/testing/resources/bookmarks_circular.pdf
@@ -67,7 +67,7 @@ Files:
 Copyright:
     2022 PDFium developers
     2024 geisserml <geisserml@gmail.com>
-License: BSD-3-Clause OR Apache-2.0
+License: BSD-3-Clause, Apache-2.0
 
 Files: tests/resources/images.pdf
 Copyright:
diff --git a/.reuse/dep5-wheel b/.reuse/dep5-wheel
index 5bf3fe889..5b046468b 100644
--- a/.reuse/dep5-wheel
+++ b/.reuse/dep5-wheel
@@ -26,4 +26,4 @@ Copyright:
     2024 PDFium developers
     2024 Developers of projects mentioned in PdfiumThirdParty
     2024 Benoît Blanchon and pdfium-binaries contributors
-License: (Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty
+License: (BSD-3-Clause, Apache-2.0) AND LicenseRef-PdfiumThirdParty
diff --git a/README.md b/README.md
index 0af837329..e92ba80d6 100644
--- a/README.md
+++ b/README.md
@@ -649,11 +649,17 @@ Usage should be largely self-explanatory, assuming a minimum of familiarity with
 
 ## Licensing
 
-pypdfium2 is available by the terms and conditions of either [`Apache-2.0`](LICENSES/Apache-2.0.txt) or [`BSD-3-Clause`](LICENSES/BSD-3-Clause.txt), at your choice.
-Various other open-source licenses apply to dependencies bundled with PDFium. Verbatim copies of their respective licenses are contained in the file [`LicenseRef-PdfiumThirdParty.txt`](LICENSES/LicenseRef-PdfiumThirdParty.txt), which also has to be shipped with binary redistributions.
+*Important: This is NOT LEGAL ADVICE, and there is ABSOLUTELY NO WARRANTY for any information provided in this document or elsewhere in the pypdfium2 project, including earlier revisions.*
+
+pypdfium2 itself is available by the terms and conditions of [`Apache-2.0`](LICENSES/Apache-2.0.txt) / [`BSD-3-Clause`](LICENSES/BSD-3-Clause.txt).
 Documentation and examples of pypdfium2 are licensed under [`CC-BY-4.0`](LICENSES/CC-BY-4.0.txt).
 
-pypdfium2 complies with the [reuse standard](https://reuse.software/spec/) by including [SPDX](https://spdx.org/licenses/) headers in source files, and license information for data files in [`.reuse/dep5`](.reuse/dep5).
+PDFium is available under a BSD-style license that can be found in its [`LICENSE`](https://pdfium.googlesource.com/pdfium/+/refs/heads/main/LICENSE) file.
+Various other open-source licenses apply to dependencies bundled with PDFium. These also have to be shipped alongside binary redistributions. Copies of identified licenses are provided in [`LicenseRef-PdfiumThirdParty.txt`](LICENSES/LicenseRef-PdfiumThirdParty.txt).
+There is no guarantee of completeness, and pdfium's dependencies might change over time. Please do notify us if you think this misses a relevant license.
+
+pypdfium2 includes [SPDX](https://spdx.org/licenses/) headers in source files.
+License information for data files is provided in [`.reuse/dep5`](.reuse/dep5) as per the [`reuse` standard](https://reuse.software/spec/).
 
 To the author's knowledge, pypdfium2 is one of the rare Python libraries that are capable of PDF rendering while not being covered by copyleft licenses (such as the `GPL`).[^liberal_pdf_renderlibs]
 
diff --git a/conda/helpers/recipe/meta.yaml b/conda/helpers/recipe/meta.yaml
index 987ad2648..06ae182b1 100644
--- a/conda/helpers/recipe/meta.yaml
+++ b/conda/helpers/recipe/meta.yaml
@@ -51,10 +51,10 @@ about:
   description: |
     This package provides python helpers around pdfium.
     Dependants are suggested to pin to a major version, but any tighter pinning is discouraged since it increases the risk for conflicts, and would lock you out from future fixes.
-  license: Apache-2.0 OR BSD-3-Clause
+  license: BSD-3-Clause, Apache-2.0
   license_file:
-    - LICENSES/Apache-2.0.txt
     - LICENSES/BSD-3-Clause.txt
+    - LICENSES/Apache-2.0.txt
     - LICENSES/CC-BY-4.0.txt
   dev_url: https://github.com/pypdfium2-team/pypdfium2
   doc_url: https://pypdfium2.readthedocs.io
diff --git a/conda/raw/recipe/meta.yaml b/conda/raw/recipe/meta.yaml
index c0af3fdc0..89626dc8f 100644
--- a/conda/raw/recipe/meta.yaml
+++ b/conda/raw/recipe/meta.yaml
@@ -52,10 +52,10 @@ about:
   description: |
     This package provides raw ctypes bindings to pdfium.
     Important: DO NOT PIN to an exact version, as pypdfium2_raw itself pins pdfium-binaries to achieve ABI safety.
-  license: Apache-2.0 OR BSD-3-Clause
+  license: BSD-3-Clause, Apache-2.0
   license_file:
-    - LICENSES/Apache-2.0.txt
     - LICENSES/BSD-3-Clause.txt
+    - LICENSES/Apache-2.0.txt
     - LICENSES/CC-BY-4.0.txt
   dev_url: https://github.com/pypdfium2-team/pypdfium2
   doc_url: https://pypdfium2.readthedocs.io
diff --git a/setup.py b/setup.py
index 6660a2b76..743da7385 100644
--- a/setup.py
+++ b/setup.py
@@ -81,7 +81,7 @@ def run_setup(modnames, pl_name, pdfium_ver):
     kwargs = dict(
         name = "pypdfium2",
         description = "Python bindings to PDFium",
-        license = "Apache-2.0 OR BSD-3-Clause",
+        license = "BSD-3-Clause, Apache-2.0, PdfiumThirdParty",
         license_files = LICENSES_SHARED,
         python_requires = ">= 3.6",
         cmdclass = {},
@@ -132,7 +132,6 @@ def run_setup(modnames, pl_name, pdfium_ver):
         kwargs["package_data"]["pypdfium2_raw"] = [VersionFN, BindingsFN, libname]
         kwargs["cmdclass"]["bdist_wheel"] = bdist_factory(pl_name)
         kwargs["distclass"] = BinaryDistribution
-        kwargs["license"] = f"({kwargs['license']}) AND LicenseRef-PdfiumThirdParty"
         kwargs["license_files"] += LICENSES_WHEEL
     
     if "pypdfium2" in kwargs["package_data"]:

From ef0854e4931cff5dc2fcbb5ba6f2e24d95848231 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 19 Sep 2024 17:29:09 +0200
Subject: [PATCH 124/140] changelog: fix typo

---
 docs/devel/changelog_staging.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index f77d83a7b..c2c164438 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -25,7 +25,7 @@
 - Added `PdfObject.get_quad_points()` to get the corner points of an image or text object.
 - Exposed `PdfPage.flatten()` (previously semi-private `_flatten()`), after having found out how to correctly use it. Added check and updated docs accordingly.
 - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released explicitly, given OS limits on the number of open FDs.
-- If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programatticaly handle the error subtype.
+- If document loading failed, `err_code` is now assigned to the `PdfiumError` instance so callers may programmatically handle the error subtype.
 - Corrected some null pointer checks: we have to use `bool(ptr)` rather than `ptr is None`.
 - Improved startup performance by deferring imports of optional dependencies to the point where they are actually needed, to avoid overhead if you do not use them.
 - Simplified version classes (no API change expected).

From d54d0417c06603cafa124bb3c0e747869fed3794 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 19 Sep 2024 17:44:07 +0200
Subject: [PATCH 125/140] PdfPage.flatten(): add note regarding invalidation of
 handles

---
 src/pypdfium2/_helpers/page.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pypdfium2/_helpers/page.py b/src/pypdfium2/_helpers/page.py
index cd451a67d..9b9d28677 100644
--- a/src/pypdfium2/_helpers/page.py
+++ b/src/pypdfium2/_helpers/page.py
@@ -319,8 +319,8 @@ def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY):
         Flatten form fields and annotations into page contents.
         
         Attention:
-            :meth:`~.PdfDocument.init_forms` must have been called on the parent pdf, before the page was retrieved, for this method to work.
-            In other words, :attr:`.PdfPage.formenv` must be non-null.
+            * :meth:`~.PdfDocument.init_forms` must have been called on the parent pdf, before the page was retrieved, for this method to work. In other words, :attr:`.PdfPage.formenv` must be non-null.
+            * Flattening may invalidate existing handles to the page, so you'll want to re-initialize them after flattening.
         
         Parameters:
             flag (int): PDFium flattening target (:attr:`FLAT_*`)

From 51d88994837a5673020dabdfe87694b257003d16 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 27 Oct 2024 00:20:59 +0200
Subject: [PATCH 126/140] `PdfBitmap.to_numpy()` Use 2d shape for
 single-channel bitmap

---
 docs/devel/changelog_staging.md  | 1 +
 src/pypdfium2/_helpers/bitmap.py | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md
index c2c164438..5309188d5 100644
--- a/docs/devel/changelog_staging.md
+++ b/docs/devel/changelog_staging.md
@@ -9,6 +9,7 @@
 - Rendering / Bitmap
   * Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). Instead, use `PdfPage.render()` with a loop or process pool.
   * Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`.
+  * `PdfBitmap.to_numpy()`: If the bitmap is single-channel (grayscale), use a 2d shape to avoid needlessly wrapping each pixel value in a list.
   * `PdfBitmap.from_pil()`: Removed `recopy` param.
   * Removed pdfium color scheme param from rendering, as it's not really useful: one can only set colors for certain object types, which are then forced on all instances of that type. This may flatten different colors into one, leading to a loss of visual information. To achieve a "dark theme" for light PDFs, we suggest to instead post-process rendered images with selective lightness inversion, as is now implemented in pypdfium2's rendering CLI.
 - Pageobjects
diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index 10e5389e1..597ac6947 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -197,7 +197,7 @@ def to_numpy(self):
         
         The array contains as many rows as the bitmap is high.
         Each row contains as many pixels as the bitmap is wide.
-        The length of each pixel corresponds to the number of channels.
+        Each pixel will be an array of values per channel, or just a value if there is only one channel.
         
         The resulting array is supposed to share memory with the original bitmap buffer,
         so changes to the buffer should be reflected in the array, and vice versa.
@@ -210,11 +210,11 @@ def to_numpy(self):
         
         array = numpy.ndarray(
             # layout: row major
-            shape = (self.height, self.width, self.n_channels),
+            shape = (self.height, self.width, self.n_channels) if self.n_channels > 1 else (self.height, self.width),
             dtype = ctypes.c_ubyte,
             buffer = self.buffer,
-            # number of bytes per item for each nesting level (outer->inner, i. e. row, pixel, value)
-            strides = (self.stride, self.n_channels, 1),
+            # number of bytes per item for each nesting level (outer->inner: row, pixel, value - or row, value for a single-channel bitmap)
+            strides = (self.stride, self.n_channels, 1) if self.n_channels > 1 else (self.stride, 1),
         )
         
         return array

From 7f12ceeac6e5e2ad0483e12b7cb64b897655255b Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Sun, 27 Oct 2024 17:38:30 +0100
Subject: [PATCH 127/140] version.py: minor cleanup

---
 src/pypdfium2/version.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py
index ec280f8dc..3bca4a666 100644
--- a/src/pypdfium2/version.py
+++ b/src/pypdfium2/version.py
@@ -81,15 +81,7 @@ def _hook(self):
             self.desc += f"@{self.origin}"
 
 
-# API
-
 PYPDFIUM_INFO = _version_pypdfium2()
-PDFIUM_INFO = _version_pdfium()
-
-
-# Docs
-
-PYPDFIUM_INFO = PYPDFIUM_INFO
 """
 pypdfium2 helpers version.
 
@@ -129,7 +121,7 @@ def _hook(self):
 """
 
 
-PDFIUM_INFO = PDFIUM_INFO
+PDFIUM_INFO = _version_pdfium()
 """
 PDFium version.
 

From 195ce71f2d5d3799adc838e879202b76ae33663d Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 30 Oct 2024 23:06:11 +0100
Subject: [PATCH 128/140] CLI(renderer/pageobjects): slightly improve code
 style

---
 src/pypdfium2/_cli/_parsers.py    | 16 +++++++++++++++-
 src/pypdfium2/_cli/pageobjects.py | 23 ++++++++++-------------
 src/pypdfium2/_cli/render.py      | 22 ++++++----------------
 3 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/src/pypdfium2/_cli/_parsers.py b/src/pypdfium2/_cli/_parsers.py
index abffe4e5d..6ff8bdfa6 100644
--- a/src/pypdfium2/_cli/_parsers.py
+++ b/src/pypdfium2/_cli/_parsers.py
@@ -3,8 +3,8 @@
 
 import os
 import sys
-import argparse
 import logging
+import argparse
 from pathlib import Path
 import pypdfium2._helpers as pdfium
 import pypdfium2.internal as pdfium_i
@@ -91,6 +91,20 @@ def get_input(args, init_forms=False, **kwargs):
     return pdf
 
 
+# dummy more_itertools.peekable().__bool__ alternative
+
+def _postpeek_generator(value, iterator):
+    yield value; yield from iterator
+
+def iterator_hasvalue(iterator):
+    try:
+        first_value = next(iterator)
+    except StopIteration:
+        return False, None
+    else:
+        return True, _postpeek_generator(first_value, iterator)
+
+
 if sys.version_info >= (3, 9):
     from argparse import BooleanOptionalAction
 
diff --git a/src/pypdfium2/_cli/pageobjects.py b/src/pypdfium2/_cli/pageobjects.py
index 933fe0ab8..7272d08f4 100644
--- a/src/pypdfium2/_cli/pageobjects.py
+++ b/src/pypdfium2/_cli/pageobjects.py
@@ -3,7 +3,6 @@
 
 # TODO test-confirm filter and info params
 
-from itertools import chain
 from collections import OrderedDict
 import pypdfium2._helpers as pdfium
 import pypdfium2.internal as pdfium_i
@@ -13,6 +12,7 @@
     add_n_digits,
     get_input,
     round_list,
+    iterator_hasvalue,
 )
 
 
@@ -43,7 +43,7 @@ def attach(parser):
     )
     parser.add_argument(
         "--info",
-        nargs = "*",
+        nargs = "+",
         type = str.lower,
         choices = INFO_PARAMS,
         default = INFO_PARAMS,
@@ -76,24 +76,21 @@ def main(args):
     if args.filter:
         args.filter = [pdfium_i.ObjectTypeToConst[t] for t in args.filter]
     
-    show_pos = (PARAM_POS in args.info)
-    show_imageinfo = (PARAM_IMGINFO in args.info)
-    total_count = 0
+    show_pos = PARAM_POS in args.info
+    show_imginfo = PARAM_IMGINFO in args.info
+    assert show_pos or show_imginfo
     
+    total_count = 0
     for i in args.pages:
         
         page = pdf[i]
-        obj_searcher = page.get_objects(args.filter, max_depth=args.max_depth)
-        # note, more_itertools.peekable() could handle this more elegantly
-        try:
-            first_obj = next(obj_searcher)
-        except StopIteration:
-            continue
+        hasvalue, obj_searcher = iterator_hasvalue( page.get_objects(args.filter, max_depth=args.max_depth) )
+        if not hasvalue: continue
         
         print(f"# Page {i+1}")
         count = 0
         
-        for obj in chain([first_obj], obj_searcher):
+        for obj in obj_searcher:
             
             pad_0 = "    " * obj.level
             pad_1 = pad_0 + "    "
@@ -106,7 +103,7 @@ def main(args):
                     quad_bounds = obj.get_quad_points()
                     print(pad_1 + f"Quad Points: {[round_list(p, args.n_digits) for p in quad_bounds]}")
             
-            if show_imageinfo and isinstance(obj, pdfium.PdfImage):
+            if show_imginfo and isinstance(obj, pdfium.PdfImage):
                 print(pad_1 + f"Filters: {obj.get_filters()}")
                 metadata = obj.get_metadata()
                 assert (metadata.width, metadata.height) == obj.get_px_size()
diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index b87233f2a..329d2a663 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -17,6 +17,7 @@
 from pypdfium2._cli._parsers import (
     add_input, get_input,
     setup_logging,
+    iterator_hasvalue,
     BooleanOptionalAction,
 )
 
@@ -288,37 +289,26 @@ def _saving_hook(self, out_path, bitmap, page, postproc_kwargs):
     @classmethod
     def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
         dst_image = src_image
-        
         if invert_lightness:
-            
             if bitmap.format == pdfium_c.FPDFBitmap_Gray:
                 dst_image = ~src_image
             else:
-                
-                if bitmap.rev_byteorder:
-                    convert_to = cv2.COLOR_RGB2HLS
-                    convert_from = cv2.COLOR_HLS2RGB
-                else:
-                    convert_to = cv2.COLOR_BGR2HLS
-                    convert_from = cv2.COLOR_HLS2BGR
-                
+                convert_to, convert_from = (cv2.COLOR_RGB2HLS, cv2.COLOR_HLS2RGB) if bitmap.rev_byteorder else (cv2.COLOR_BGR2HLS, cv2.COLOR_HLS2BGR)
                 dst_image = cv2.cvtColor(dst_image, convert_to)
                 h, l, s = cv2.split(dst_image)
                 l = ~l
                 dst_image = cv2.merge([h, l, s])
                 dst_image = cv2.cvtColor(dst_image, convert_from)
-            
             if exclude_images:
-                assert bitmap.format != pdfium_c.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2"
+                assert bitmap.format != pdfium_c.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2"  # FIXME?
                 posconv = bitmap.get_posconv(page)
-                image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1))
-                if len(image_objs) > 0:
+                have_images, obj_searcher = iterator_hasvalue( page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1) )
+                if have_images:
                     mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8)
-                    for obj in image_objs:
+                    for obj in obj_searcher:
                         qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32)
                         cv2.fillPoly(mask, [qpoints], 1)
                     dst_image = cv2.copyTo(src_image, mask=mask, dst=dst_image)
-            
         return dst_image
 
 

From 5362127d9e534124881cf46cdd3bc1653ab0e187 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 25 Nov 2024 16:59:09 +0100
Subject: [PATCH 129/140] Fix some dirty code in pdfium build script

The script had two consecutive `use_syslibs` if-blocks that could be merged
into one. Also drop the redundant `is_sync` flag in dl_pdfium() by reusing
`do_update`.
---
 setupsrc/pypdfium2_setup/build_pdfium.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/setupsrc/pypdfium2_setup/build_pdfium.py b/setupsrc/pypdfium2_setup/build_pdfium.py
index b2424bc4e..2306d6149 100755
--- a/setupsrc/pypdfium2_setup/build_pdfium.py
+++ b/setupsrc/pypdfium2_setup/build_pdfium.py
@@ -88,20 +88,18 @@ def dl_depottools(do_update):
 
 def dl_pdfium(GClient, do_update, revision):
     
-    is_sync = True
-    
     if PDFiumDir.exists():
         if do_update:
             print("PDFium: Revert / Sync  ...")
             run_cmd([GClient, "revert"], cwd=SBDir)
         else:
-            is_sync = False
             print("PDFium: Using existing repository as-is.")
     else:
         print("PDFium: Download ...")
+        do_update = True
         run_cmd([GClient, "config", "--custom-var", "checkout_configuration=minimal", "--unmanaged", PdfiumURL], cwd=SBDir)
     
-    if is_sync:
+    if do_update:
         # TODO consider passing -D ?
         run_cmd([GClient, "sync", "--revision", f"origin/{revision}", "--no-history", "--shallow"], cwd=SBDir)
         # quick & dirty fix to make a versioned commit available (pdfium gets tagged frequently, so this should be more than enough in practice)
@@ -109,7 +107,7 @@ def dl_pdfium(GClient, do_update, revision):
         run_cmd(["git", "fetch", "--depth=100"], cwd=PDFiumDir)
         run_cmd(["git", "fetch", "--depth=100"], cwd=PDFiumDir)
     
-    return is_sync
+    return do_update
 
 
 def _dl_unbundler():
@@ -245,21 +243,19 @@ def main(
     GN      = get_tool("gn")
     Ninja   = get_tool("ninja")
     
-    pdfium_dl_done = dl_pdfium(GClient, b_update, b_revision)
+    did_pdfium_sync = dl_pdfium(GClient, b_update, b_revision)
     v_short, v_post = identify_pdfium()
     print(f"Version {v_short} {v_post}", file=sys.stderr)
     
-    if pdfium_dl_done:
+    if did_pdfium_sync:
         patch_pdfium(v_short)
-    if b_use_syslibs:
-        _dl_unbundler()
-
-    if b_use_syslibs:
-        run_cmd(["python3", "build/linux/unbundle/replace_gn_files.py", "--system-libraries", "icu"], cwd=PDFiumDir)
     
     config_dict = DefaultConfig.copy()
     if b_use_syslibs:
+        _dl_unbundler()
+        run_cmd(["python3", "build/linux/unbundle/replace_gn_files.py", "--system-libraries", "icu"], cwd=PDFiumDir)
         config_dict.update(SyslibsConfig)
+    
     config_str = serialise_config(config_dict)
     print(f"\nBuild configuration:\n{config_str}\n")
     

From cc728db5823dcab29bdc5893375f1d5e7cd88582 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 27 Nov 2024 00:36:01 +0100
Subject: [PATCH 130/140] Consistently use iterator_hasvalue()

---
 src/pypdfium2/_cli/render.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 329d2a663..03e9f9012 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -262,11 +262,11 @@ def postprocess(cls, src_image, page, posconv, invert_lightness, exclude_images)
             if exclude_images:
                 # FIXME pdfium does not seem to provide APIs to translate XObject to page coordinates, so not sure how to handle images nested in XObjects.
                 # FIXME we'd also like to take alpha masks into account, but this may be difficult as long as pdfium does not expose them directly.
-                image_objs = list(page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1))
-                if len(image_objs) > 0:
+                have_images, obj_walker = iterator_hasvalue( page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1) )
+                if have_images:
                     mask = PIL.Image.new("1", src_image.size)
                     draw = PIL.ImageDraw.Draw(mask)
-                    for obj in image_objs:
+                    for obj in obj_walker:
                         qpoints = [posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()]
                         draw.polygon(qpoints, fill=1)
                     dst_image.paste(src_image, mask=mask)
@@ -302,10 +302,10 @@ def postprocess(cls, src_image, bitmap, page, invert_lightness, exclude_images):
             if exclude_images:
                 assert bitmap.format != pdfium_c.FPDFBitmap_BGRx, "Not sure how to paste with mask on {RGB,BGR}X image using cv2"  # FIXME?
                 posconv = bitmap.get_posconv(page)
-                have_images, obj_searcher = iterator_hasvalue( page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1) )
+                have_images, obj_walker = iterator_hasvalue( page.get_objects([pdfium_c.FPDF_PAGEOBJ_IMAGE], max_depth=1) )
                 if have_images:
                     mask = np.zeros((bitmap.height, bitmap.width, 1), np.uint8)
-                    for obj in obj_searcher:
+                    for obj in obj_walker:
                         qpoints = np.array([posconv.to_bitmap(x, y) for x, y in obj.get_quad_points()], np.int32)
                         cv2.fillPoly(mask, [qpoints], 1)
                     dst_image = cv2.copyTo(src_image, mask=mask, dst=dst_image)

From d39dbf8cd8e493542522a0c378d01a72b4118ace Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 27 Nov 2024 01:16:20 +0100
Subject: [PATCH 131/140] fix awkward list default

Having a mutable default parameter is dangerous / bad practice (although
it was not immediately harmful in this instance, since the default list
was only read, never modified).
---
 src/pypdfium2/version.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/pypdfium2/version.py b/src/pypdfium2/version.py
index 3bca4a666..2be800a8f 100644
--- a/src/pypdfium2/version.py
+++ b/src/pypdfium2/version.py
@@ -32,12 +32,12 @@ def __repr__(self):
     def _craft_tag(self):
         return ".".join(str(v) for v in self.api_tag)
     
-    def _craft_desc(self, suffix=[]):
+    def _craft_desc(self, *suffixes):
         
         local_ver = []
         if self.n_commits > 0:
             local_ver += [str(self.n_commits), str(self.hash)]
-        local_ver += suffix
+        local_ver += suffixes
         
         desc = ""
         if local_ver:
@@ -56,8 +56,8 @@ def _hook(self):
         if self.beta is not None:
             self.tag += f"b{self.beta}"
         
-        suffix = ["dirty"] if self.dirty else []
-        self.desc = self._craft_desc(suffix)
+        suffixes = ["dirty"] if self.dirty else []
+        self.desc = self._craft_desc(*suffixes)
         if self.data_source != "git":
             self.desc += f":{self.data_source}"
         if self.is_editable:

From 2778473241cb457e6737f8cd4d7a1edf77661f98 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 27 Nov 2024 01:58:12 +0100
Subject: [PATCH 132/140] Avoid bool dicts

---
 src/pypdfium2/_helpers/bitmap.py      |  8 ++++----
 src/pypdfium2/_helpers/pageobjects.py | 12 ++++--------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/pypdfium2/_helpers/bitmap.py b/src/pypdfium2/_helpers/bitmap.py
index 597ac6947..403c68ce4 100644
--- a/src/pypdfium2/_helpers/bitmap.py
+++ b/src/pypdfium2/_helpers/bitmap.py
@@ -57,10 +57,10 @@ def __init__(self, raw, buffer, width, height, stride, format, rev_byteorder, ne
         self.format = format
         self.rev_byteorder = rev_byteorder
         self.n_channels = pdfium_i.BitmapTypeToNChannels[self.format]
-        self.mode = {
-            False: pdfium_i.BitmapTypeToStr,
-            True: pdfium_i.BitmapTypeToStrReverse,
-        }[self.rev_byteorder][self.format]
+        self.mode = (
+            pdfium_i.BitmapTypeToStrReverse if self.rev_byteorder else \
+            pdfium_i.BitmapTypeToStr
+        )[self.format]
         
         # slot to store arguments for PdfPosConv, set on page rendering
         self._pos_args = None
diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py
index 7565b1598..f59b8659b 100644
--- a/src/pypdfium2/_helpers/pageobjects.py
+++ b/src/pypdfium2/_helpers/pageobjects.py
@@ -235,10 +235,8 @@ def load_jpeg(self, source, pages=None, inline=False, autoclose=True):
             raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte stream.")
         
         bufaccess, to_hold = pdfium_i.get_bufreader(buffer)
-        loader = {
-            False: pdfium_c.FPDFImageObj_LoadJpegFile,
-            True: pdfium_c.FPDFImageObj_LoadJpegFileInline,
-        }[inline]
+        loader = pdfium_c.FPDFImageObj_LoadJpegFileInline if inline else \
+                 pdfium_c.FPDFImageObj_LoadJpegFile
         
         c_pages, page_count = pdfium_i.pages_c_array(pages)
         ok = loader(c_pages, page_count, self, bufaccess)
@@ -306,10 +304,8 @@ def get_data(self, decode_simple=False):
         Returns:
             ctypes.Array: The data of the image stream (as :class:`~ctypes.c_ubyte` array).
         """
-        func = {
-            False: pdfium_c.FPDFImageObj_GetImageDataRaw,
-            True: pdfium_c.FPDFImageObj_GetImageDataDecoded,
-        }[decode_simple]
+        func = pdfium_c.FPDFImageObj_GetImageDataDecoded if decode_simple else \
+               pdfium_c.FPDFImageObj_GetImageDataRaw
         n_bytes = func(self, None, 0)
         buffer = (ctypes.c_ubyte * n_bytes)()
         func(self, buffer, n_bytes)

From 58f508af629c290ec62dbdafecd4100024864b6d Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 27 Nov 2024 02:02:55 +0100
Subject: [PATCH 133/140] fix awkward formatting w/ auto-wrap

Alternatively, we could put the + at the beginning of the next line,
but in this case it's easiest to just omit it and rely on implicit
concatenation of adjacent string literals.
---
 setupsrc/pypdfium2_setup/autorelease.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setupsrc/pypdfium2_setup/autorelease.py b/setupsrc/pypdfium2_setup/autorelease.py
index 1f5525f48..979cdca39 100644
--- a/setupsrc/pypdfium2_setup/autorelease.py
+++ b/setupsrc/pypdfium2_setup/autorelease.py
@@ -190,7 +190,7 @@ def main():
         parsed_helpers = parse_git_tag()
         if new_helpers != parsed_helpers:
             print(
-                "Warning: Written and parsed helpers do not match. This should not happen in CI.\n" +
+                "Warning: Written and parsed helpers do not match. This should not happen in CI.\n"
                 f"In: {new_helpers}\n" + f"Out: {parsed_helpers}"
             )
     make_releasenotes(summary, record["pdfium"], new_pdfium, prev_tag, new_tag, c_updates)

From 11469fec6cea357d3f6a4241cc640884b5900e79 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 18 Dec 2024 20:23:04 +0100
Subject: [PATCH 134/140] add two FIXMEs

---
 src/pypdfium2/_library_scope.py | 1 +
 tests/test_misc.py              | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/pypdfium2/_library_scope.py b/src/pypdfium2/_library_scope.py
index d66daf21e..54c5cbb85 100644
--- a/src/pypdfium2/_library_scope.py
+++ b/src/pypdfium2/_library_scope.py
@@ -10,6 +10,7 @@
 def init_lib():
     assert not pdfium_i.LIBRARY_AVAILABLE
     if pdfium_i.DEBUG_AUTOCLOSE:
+        # FIXME never shown, because DEBUG_AUTOCLOSE can only be set on the caller side after pypdfium2 has been imported...
         print("Initialize PDFium (auto)", file=sys.stderr)
     
     # PDFium init API may change in the future: https://crbug.com/pdfium/1446
diff --git a/tests/test_misc.py b/tests/test_misc.py
index 739fe9e98..33e72c173 100644
--- a/tests/test_misc.py
+++ b/tests/test_misc.py
@@ -40,6 +40,7 @@ def _filter(prefix, skips=[], type=int):
 BitmapNsp = _filter("FPDFBitmap_", [pdfium_c.FPDFBitmap_Unknown])
 PageObjNsp = _filter("FPDF_PAGEOBJ_")
 ErrorMapping = pdfium_i.ErrorToStr
+# FIXME this will cause an erroneous test failure when using the reference bindings with a non-XFA build
 if "XFA" in PDFIUM_INFO.flags:
     ErrorMapping.update(pdfium_i.XFAErrorToStr)
 

From 379d9b5a949cc4bf0de2dd63636b765545b07dc7 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Wed, 18 Dec 2024 20:59:33 +0100
Subject: [PATCH 135/140] First steps towards android detection

---
 setupsrc/pypdfium2_setup/packaging_base.py | 34 +++++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py
index 887a6c6b1..481e7c0c9 100644
--- a/setupsrc/pypdfium2_setup/packaging_base.py
+++ b/setupsrc/pypdfium2_setup/packaging_base.py
@@ -305,8 +305,12 @@ def __init__(self):
         # If we are on Linux, check if we have glibc or musl
         self._libc_name, self._libc_ver = _get_libc_info()
         
-        # TODO consider cached property for platform and system
-        self.platform = self._get_platform()
+        # TODO consider cached property for platform and system?
+        try:
+            self.platform = self._get_platform()
+        except Exception as e:
+            self.platform = None
+            self._exc = e
         self.system = None
         if self.platform is not None:
             self.system = plat_to_system(self.platform)
@@ -320,19 +324,30 @@ def __repr__(self):
     def _is_plat(self, system, machine):
         return self._system_name.startswith(system) and self._machine_name.startswith(machine)
     
+    def _handle_linux_libc(self, archid):
+        if self._libc_name == "glibc":
+            return getattr(PlatNames, f"linux_{archid}")
+        elif self._libc_name == "musl":
+            return getattr(PlatNames, f"linux_musl_{archid}")
+        elif self._libc_name == "libc":
+            raise RuntimeError(f"Android {archid!r} prior to PEP 738 - not handled in pypdfium2 yet.")
+        else:
+            raise RuntimeError(f"Linux with unhandled libc {self._libc_name!r}.")
+    
     def _get_platform(self):
-        # some machine names are merely "qualified guesses", mistakes can't be fully excluded for platforms we don't have access to
         if self._is_plat("darwin", "x86_64"):
             return PlatNames.darwin_x64
         elif self._is_plat("darwin", "arm64"):
             return PlatNames.darwin_arm64
         elif self._is_plat("linux", "x86_64"):
-            return PlatNames.linux_x64 if self._libc_name != "musl" else PlatNames.linux_musl_x64
+            return self._handle_linux_libc("x64")
         elif self._is_plat("linux", "i686"):
-            return PlatNames.linux_x86 if self._libc_name != "musl" else PlatNames.linux_musl_x86
+            return self._handle_linux_libc("x86")
         elif self._is_plat("linux", "aarch64"):
-            return PlatNames.linux_arm64 if self._libc_name != "musl" else PlatNames.linux_musl_arm64
+            return self._handle_linux_libc("arm64")
         elif self._is_plat("linux", "armv7l"):
+            if self._libc_name != "glibc":
+                raise RuntimeError(f"armv7l: only glibc supported at this time, you have {self._libc_name!r}")  # no musl/android
             return PlatNames.linux_arm32
         elif self._is_plat("windows", "amd64"):
             return PlatNames.windows_x64
@@ -340,8 +355,10 @@ def _get_platform(self):
             return PlatNames.windows_arm64
         elif self._is_plat("windows", "x86"):
             return PlatNames.windows_x86
+        elif self._system_name.startswith("android"):
+            raise RuntimeError(f"Android {self._machine_name!r} with PEP 738 - not handled in pypdfium2 yet.")
         else:
-            return None
+            raise RuntimeError(f"Unhandled platform: {self!r}")
 
 Host = _host_platform()
 
@@ -608,7 +625,8 @@ def parse_pl_spec(pl_spec, with_prepare=True):
     if not pl_spec or pl_spec == "auto":
         pl_name = Host.platform
         if pl_name is None:
-            raise RuntimeError(f"No pre-built binaries available for {Host}. You may place custom binaries & bindings in data/sourcebuild and install with `{PlatSpec_EnvVar}=sourcebuild`.")
+            print(f"No pre-built binaries available for this host. You may place custom binaries & bindings in data/sourcebuild/ and install with `{PlatSpec_EnvVar}=sourcebuild`.", file=sys.stderr)
+            raise Host._exc
     elif hasattr(ExtPlats, pl_spec):
         pl_name = getattr(ExtPlats, pl_spec)
     elif hasattr(PlatNames, pl_spec):

From 4b90faa331fc7ec29fee75a64f7b44e2d7c4986f Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 19 Dec 2024 21:32:13 +0100
Subject: [PATCH 136/140] Update test expectation for toc_maxdepth

---
 tests/expectations/toc_maxdepth.txt | 37 +++++++++++++++--------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/tests/expectations/toc_maxdepth.txt b/tests/expectations/toc_maxdepth.txt
index beeacb932..889fe0075 100644
--- a/tests/expectations/toc_maxdepth.txt
+++ b/tests/expectations/toc_maxdepth.txt
@@ -1,20 +1,21 @@
-[+] 1.outline -> 1  # FitH [746.439]
-    [+] 1.1.outline -> 1  # FitH [700.878]
-        [+] 1.1.1.outline -> 1  # FitH [632.537]
-            [+] 1.1.1.1.outline -> 1  # FitH [632.946]
-                [+] 1.1.1.1.1.outline -> 1  # FitH [597.304]
-                    [+] 1.1.1.1.1.1outline -> 1  # FitH [632.946]
-                        [+] 1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                            [+] 1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                [+] 1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                    [+] 1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                        [+] 1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                            [+] 1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                                [+] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                                    [+] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-                                                        [+] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
-[+] 2.outline -> 2  # FitH [749.477]
-    [+] 2.1.outline -> 2  # FitH [699.36]
-        [+] 2.1.1.outline -> 2  # FitH [628.74]
+[+100] 1.outline -> 1  # FitH [746.439]
+    [+100] 1.1.outline -> 1  # FitH [700.878]
+        [+1] 1.1.1.outline -> 1  # FitH [632.537]
+            [+1] 1.1.1.1.outline -> 1  # FitH [632.946]
+                [+1] 1.1.1.1.1.outline -> 1  # FitH [597.304]
+                    [+1] 1.1.1.1.1.1outline -> 1  # FitH [632.946]
+                        [+1] 1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                            [+1] 1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                [+1] 1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                    [+1] 1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                        [+1] 1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                            [+1] 1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                                [+1] 1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                                    [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+                                                        [+1] 1.1.1.1.1.1.1.1.1.1.1.1.1.1.outline -> 1  # FitH [632.946]
+Maximum recursion depth 15 reached (subtree skipped).
+[+100] 2.outline -> 2  # FitH [749.477]
+    [+100] 2.1.outline -> 2  # FitH [699.36]
+        [+100] 2.1.1.outline -> 2  # FitH [628.74]
             [*] 2.1.1.1.outline -> 2  # FitH [583.179]
     [*] 2.2 outline -> 2  # FitH [515.218]

From f0409fc164b29871f3b5f3a835329e965e2aab88 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Thu, 19 Dec 2024 21:44:17 +0100
Subject: [PATCH 137/140] Clean up & tighten platform detection

---
 setupsrc/pypdfium2_setup/packaging_base.py | 51 +++++++++++-----------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py
index 481e7c0c9..8df8e2d9b 100644
--- a/setupsrc/pypdfium2_setup/packaging_base.py
+++ b/setupsrc/pypdfium2_setup/packaging_base.py
@@ -321,9 +321,6 @@ def __repr__(self):
             info += f", {self._libc_name} {self._libc_ver}"
         return f"<Host: {info}>"
     
-    def _is_plat(self, system, machine):
-        return self._system_name.startswith(system) and self._machine_name.startswith(machine)
-    
     def _handle_linux_libc(self, archid):
         if self._libc_name == "glibc":
             return getattr(PlatNames, f"linux_{archid}")
@@ -335,30 +332,32 @@ def _handle_linux_libc(self, archid):
             raise RuntimeError(f"Linux with unhandled libc {self._libc_name!r}.")
     
     def _get_platform(self):
-        if self._is_plat("darwin", "x86_64"):
-            return PlatNames.darwin_x64
-        elif self._is_plat("darwin", "arm64"):
-            return PlatNames.darwin_arm64
-        elif self._is_plat("linux", "x86_64"):
-            return self._handle_linux_libc("x64")
-        elif self._is_plat("linux", "i686"):
-            return self._handle_linux_libc("x86")
-        elif self._is_plat("linux", "aarch64"):
-            return self._handle_linux_libc("arm64")
-        elif self._is_plat("linux", "armv7l"):
-            if self._libc_name != "glibc":
-                raise RuntimeError(f"armv7l: only glibc supported at this time, you have {self._libc_name!r}")  # no musl/android
-            return PlatNames.linux_arm32
-        elif self._is_plat("windows", "amd64"):
-            return PlatNames.windows_x64
-        elif self._is_plat("windows", "arm64"):
-            return PlatNames.windows_arm64
-        elif self._is_plat("windows", "x86"):
-            return PlatNames.windows_x86
-        elif self._system_name.startswith("android"):
+        if self._system_name == "darwin":
+            if self._machine_name == "x86_64":
+                return PlatNames.darwin_x64
+            elif self._machine_name == "arm64":
+                return PlatNames.darwin_arm64
+        elif self._system_name == "linux":
+            if self._machine_name == "x86_64":
+                return self._handle_linux_libc("x64")
+            elif self._machine_name == "i686":
+                return self._handle_linux_libc("x86")
+            elif self._machine_name == "aarch64":
+                return self._handle_linux_libc("arm64")
+            elif self._machine_name == "armv7l":
+                if self._libc_name != "glibc":
+                    raise RuntimeError(f"armv7l: only glibc supported at this time, you have {self._libc_name!r}")  # no musl/android
+                return PlatNames.linux_arm32
+        elif self._system_name == "windows":
+            if self._machine_name == "amd64":
+                return PlatNames.windows_x64
+            elif self._machine_name == "x86":
+                return PlatNames.windows_x86
+            elif self._machine_name == "arm64":
+                return PlatNames.windows_arm64
+        elif self._system_name == "android":
             raise RuntimeError(f"Android {self._machine_name!r} with PEP 738 - not handled in pypdfium2 yet.")
-        else:
-            raise RuntimeError(f"Unhandled platform: {self!r}")
+        raise RuntimeError(f"Unhandled platform: {self!r}")
 
 Host = _host_platform()
 

From 8d87525d356f430f327a3ae28d470cfbbf2e13c9 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 27 Dec 2024 01:20:56 +0100
Subject: [PATCH 138/140] Build reference bindings without srcinfo

CC #335
---
 setupsrc/pypdfium2_setup/autorelease.py    | 2 +-
 setupsrc/pypdfium2_setup/packaging_base.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/setupsrc/pypdfium2_setup/autorelease.py b/setupsrc/pypdfium2_setup/autorelease.py
index 979cdca39..69a77a91d 100644
--- a/setupsrc/pypdfium2_setup/autorelease.py
+++ b/setupsrc/pypdfium2_setup/autorelease.py
@@ -23,7 +23,7 @@ def run_local(*args, **kws):
 
 def update_refbindings(version):
     RefBindingsFile.unlink()
-    build_pdfium_bindings(version, guard_symbols=True, flags=REFBINDINGS_FLAGS, allow_system_despite_libdirs=True)
+    build_pdfium_bindings(version, guard_symbols=True, flags=REFBINDINGS_FLAGS, allow_system_despite_libdirs=True, no_srcinfo=True)
     shutil.copyfile(DataDir_Bindings/BindingsFN, RefBindingsFile)
     assert RefBindingsFile.exists()
 
diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py
index 8df8e2d9b..7e4cbf437 100644
--- a/setupsrc/pypdfium2_setup/packaging_base.py
+++ b/setupsrc/pypdfium2_setup/packaging_base.py
@@ -444,7 +444,7 @@ def tmp_cwd_context(tmp_cwd):
         os.chdir(orig_cwd)
 
 
-def run_ctypesgen(target_dir, headers_dir, flags=[], guard_symbols=False, compile_lds=[], run_lds=["."], allow_system_despite_libdirs=False):
+def run_ctypesgen(target_dir, headers_dir, flags=[], compile_lds=[], run_lds=["."], allow_system_despite_libdirs=False, guard_symbols=False, no_srcinfo=False):
     # Import ctypesgen only in this function so it does not have to be available for other setup tasks
     import ctypesgen
     assert getattr(ctypesgen, "PYPDFIUM2_SPECIFIC", False), "pypdfium2 requires fork of ctypesgen"
@@ -465,6 +465,8 @@ def run_ctypesgen(target_dir, headers_dir, flags=[], guard_symbols=False, compil
     args += ["--no-macro-guards"]
     if not guard_symbols:
         args += ["--no-symbol-guards"]
+    if no_srcinfo:
+        args += ["--no-srcinfo"]
     
     # pre-processor - if not given, pypdfium2-ctypesgen will try to auto-select as available (gcc/clang)
     c_preproc = os.environ.get("CPP", None)

From bc0d92e192e1ebb0a3d10e6f820cf1edd6381269 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Fri, 27 Dec 2024 01:30:17 +0100
Subject: [PATCH 139/140] Fix perilous mutable defaults

I don't think there were any actual issues, but mutable default arguments
are shared across calls, which is too risky in general. Use tuples instead.
---
 setupsrc/pypdfium2_setup/craft_packages.py | 2 +-
 setupsrc/pypdfium2_setup/emplace.py        | 2 +-
 setupsrc/pypdfium2_setup/packaging_base.py | 8 ++++----
 tests/test_misc.py                         | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/setupsrc/pypdfium2_setup/craft_packages.py b/setupsrc/pypdfium2_setup/craft_packages.py
index de42bc421..cebf48718 100644
--- a/setupsrc/pypdfium2_setup/craft_packages.py
+++ b/setupsrc/pypdfium2_setup/craft_packages.py
@@ -110,7 +110,7 @@ def main_pypi(args):
             clean_platfiles()
 
 
-def run_conda_build(recipe_dir, out_dir, args=[]):
+def run_conda_build(recipe_dir, out_dir, args=()):
     with TmpCommitCtx():
         run_cmd(["conda", "build", recipe_dir, "--output-folder", out_dir, *args], cwd=ProjectDir, env=os.environ)
 
diff --git a/setupsrc/pypdfium2_setup/emplace.py b/setupsrc/pypdfium2_setup/emplace.py
index fe15a5fcf..81bfae246 100644
--- a/setupsrc/pypdfium2_setup/emplace.py
+++ b/setupsrc/pypdfium2_setup/emplace.py
@@ -53,7 +53,7 @@ def prepare_setup(pl_name, pdfium_ver, use_v8):
     if pl_name == ExtPlats.system:
         # TODO add option for caller to pass in custom headers_dir, run_lds and flags? unfortunately it's not straightforward how to integrate this
         # also want to consider accepting a full version for offline setup
-        build_pdfium_bindings(pdfium_ver, flags=flags, guard_symbols=True, run_lds=[])
+        build_pdfium_bindings(pdfium_ver, flags=flags, guard_symbols=True, run_lds=())
         shutil.copyfile(DataDir_Bindings/BindingsFN, ModuleDir_Raw/BindingsFN)
         write_pdfium_info(ModuleDir_Raw, pdfium_ver, origin="system", flags=flags)
         return [BindingsFN, VersionFN]
diff --git a/setupsrc/pypdfium2_setup/packaging_base.py b/setupsrc/pypdfium2_setup/packaging_base.py
index 7e4cbf437..e518e3820 100644
--- a/setupsrc/pypdfium2_setup/packaging_base.py
+++ b/setupsrc/pypdfium2_setup/packaging_base.py
@@ -202,8 +202,8 @@ def write_json(fp, data, indent=2):
         return json.dump(data, buf, indent=indent)
 
 
-def write_pdfium_info(dir, build, origin, flags=[], n_commits=0, hash=None):
-    info = dict(**PdfiumVer.to_full(build)._asdict(), n_commits=n_commits, hash=hash, origin=origin, flags=flags)
+def write_pdfium_info(dir, build, origin, flags=(), n_commits=0, hash=None):
+    info = dict(**PdfiumVer.to_full(build)._asdict(), n_commits=n_commits, hash=hash, origin=origin, flags=list(flags))
     write_json(dir/VersionFN, info)
     return info
 
@@ -444,7 +444,7 @@ def tmp_cwd_context(tmp_cwd):
         os.chdir(orig_cwd)
 
 
-def run_ctypesgen(target_dir, headers_dir, flags=[], compile_lds=[], run_lds=["."], allow_system_despite_libdirs=False, guard_symbols=False, no_srcinfo=False):
+def run_ctypesgen(target_dir, headers_dir, flags=(), compile_lds=(), run_lds=(".", ), allow_system_despite_libdirs=False, guard_symbols=False, no_srcinfo=False):
     # Import ctypesgen only in this function so it does not have to be available for other setup tasks
     import ctypesgen
     assert getattr(ctypesgen, "PYPDFIUM2_SPECIFIC", False), "pypdfium2 requires fork of ctypesgen"
@@ -491,7 +491,7 @@ def run_ctypesgen(target_dir, headers_dir, flags=[], compile_lds=[], run_lds=[".
 
 
 def build_pdfium_bindings(version, headers_dir=None, **kwargs):
-    defaults = dict(flags=[], run_lds=["."], guard_symbols=False)
+    defaults = dict(flags=(), run_lds=(".", ), guard_symbols=False)
     for k, v in defaults.items():
         kwargs.setdefault(k, v)
     
diff --git a/tests/test_misc.py b/tests/test_misc.py
index 33e72c173..822edc80c 100644
--- a/tests/test_misc.py
+++ b/tests/test_misc.py
@@ -27,7 +27,7 @@ def test_color_tohex(color_in, rev_byteorder, exp_color):
     assert pdfium_c.FPDF_GetBValue(exp_color) == channels[3]
 
 
-def _filter(prefix, skips=[], type=int):
+def _filter(prefix, skips=(), type=int):
     items = []
     for attr in dir(pdfium_c):
         value = getattr(pdfium_c, attr)

From 820f5c53b4b572ccc3131067ecca7667b6a63017 Mon Sep 17 00:00:00 2001
From: geisserml <geisserml@gmail.com>
Date: Mon, 30 Dec 2024 21:10:53 +0100
Subject: [PATCH 140/140] Warn about --optimize-mode lcd with
 --invert-lightness

Unfortunately, colour post-processing destroys LCD optimization (subpixel
rendering), so log a warning when both options are combined.
---
 src/pypdfium2/_cli/render.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
index 03e9f9012..6d4567205 100644
--- a/src/pypdfium2/_cli/render.py
+++ b/src/pypdfium2/_cli/render.py
@@ -196,7 +196,7 @@ def attach(parser):
     postproc.add_argument(
         "--invert-lightness",
         action = "store_true",
-        help = "Invert lightness using the HLS color space (e.g. white<->black, dark_blue<->light_blue). The intent is to achieve a dark theme for documents with light background, while providing better visual results than classical color inversion or a flat pdfium color scheme.",
+        help = "Invert lightness using the HLS color space (e.g. white<->black, dark_blue<->light_blue). The intent is to achieve a dark theme for documents with light background, while providing better visual results than classical color inversion or a flat pdfium color scheme. However, note that --optimize-mode lcd is not recommendable when inverting lightness.",
     )
     postproc.add_argument(
         "--exclude-images",
@@ -393,6 +393,8 @@ def main(args):
         invert_lightness = args.invert_lightness,
         exclude_images = args.exclude_images,
     )
+    if args.invert_lightness and args.optimize_mode == "lcd":
+        logger.warning("LCD optimization clashes with lightness inversion, as post-processing colours defeats the idea of subpixel rendering.")
     
     # TODO dump all args except password?
     logger.info(f"{args.engine_cls.__name__}, Format: {args.format}, rev_byteorder: {args.rev_byteorder}, prefer_bgrx {args.prefer_bgrx}")