From 2d1c80573288b8f171e4253791a541f8a7a3ba1c Mon Sep 17 00:00:00 2001 From: geisserml Date: Thu, 4 Apr 2024 23:02:25 +0200 Subject: [PATCH] Work on `PdfImage.extract()` --- docs/devel/changelog_staging.md | 4 ++- src/pypdfium2/_cli/extract_images.py | 4 +-- src/pypdfium2/_helpers/pageobjects.py | 35 ++++++++++++--------------- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/docs/devel/changelog_staging.md b/docs/devel/changelog_staging.md index 03efeb76b..f7389f560 100644 --- a/docs/devel/changelog_staging.md +++ b/docs/devel/changelog_staging.md @@ -6,9 +6,11 @@ # Changelog for next release *API-breaking changes* -- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting in `n_kids` and `is_closed`. Also, distinguish between `dest == None` and an empty dest. - Removed `PdfDocument.render()` (see deprecation rationale in v4.25 changelog). + Instead, use `PdfPage.render()` with a loop or process pool. - Removed `PdfBitmap.get_info()` and `PdfBitmapInfo`, which existed only on behalf of data transfer with `PdfDocument.render()`. +- `PdfDocument.get_toc()`: Replaced `PdfOutlineItem` namedtuple with method-oriented wrapper classes `PdfBookmark` and `PdfDest`, so callers may retrieve only the properties they actually need. This is closer to pdfium's original API and exposes the underlying raw objects. Provides signed count as-is rather than splitting into `n_kids` and `is_closed`. Also distinguishes between `dest == None` and an empty dest. +- Removed `fb_render` parameter from `PdfImage.extract()` because it does not fit in this API. If the image's rendered bitmap is desired, use `.get_bitmap(render=True)` in the first place. 
*Improvements and new features* - Added context manager support to `PdfDocument`, so it can be used in a `with`-statement, because opening from a file path binds a file descriptor, which should be released safely and as soon as possible, given OS limits on the number of open FDs. diff --git a/src/pypdfium2/_cli/extract_images.py b/src/pypdfium2/_cli/extract_images.py index 6fd94569e..6091d3489 100644 --- a/src/pypdfium2/_cli/extract_images.py +++ b/src/pypdfium2/_cli/extract_images.py @@ -37,7 +37,7 @@ def attach(parser): parser.add_argument( "--render", action = "store_true", - help = "Whether to get rendered bitmaps, taking masks and transform matrices into account. (Fallback if doing smart extraction.)", + help = "Whether to get rendered bitmaps, taking masks and transform matrices into account. (requires --use-bitmap, ignored with smart extraction)", ) @@ -71,7 +71,7 @@ def main(args): pil_image = image.get_bitmap(render=args.render).to_pil() pil_image.save( prefix.with_suffix("."+args.format) ) else: - image.extract(prefix, fb_format=args.format, fb_render=args.render) + image.extract(prefix, fb_format=args.format) except pdfium.PdfiumError: traceback.print_exc() image.close() diff --git a/src/pypdfium2/_helpers/pageobjects.py b/src/pypdfium2/_helpers/pageobjects.py index 2be708f1a..5f5796174 100644 --- a/src/pypdfium2/_helpers/pageobjects.py +++ b/src/pypdfium2/_helpers/pageobjects.py @@ -23,8 +23,7 @@ class PdfObject (pdfium_i.AutoCloseable): """ Page object helper class. - When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, - depending on the object's :attr:`.type` (e. g. :class:`.PdfImage`). + When constructing a :class:`.PdfObject`, an instance of a more specific subclass may be returned instead, depending on the object's :attr:`.type` (e. g. :class:`.PdfImage`). 
Attributes: raw (FPDF_PAGEOBJECT): @@ -121,8 +120,6 @@ def transform(self, matrix): pdfium_c.FPDFPageObj_Transform(self, *matrix.get()) -# In principle, we would like to move PdfImage to a separate file, but it's not that easy because of the two-fold connection with PdfObject, which would run us into a circular import. (However, what we could do is externalize the class under a different name and turn PdfImage into a wrapper which merely inherits from that class.) - class PdfImage (PdfObject): """ Image object helper class (specific kind of page object). @@ -141,7 +138,7 @@ def new(cls, pdf): Returns: PdfImage: Handle to a new, empty image. Note that position and size of the image are defined by its matrix, which defaults to the identity matrix. - This means that new images will appear as a tiny square of 1x1 units on the bottom left corner of the page. + This means that new images will appear as a tiny square of 1x1 canvas units on the bottom left corner of the page. Use :class:`.PdfMatrix` and :meth:`.set_matrix` to adjust size and position. """ raw_img = pdfium_c.FPDFPageObj_NewImageObj(pdf) @@ -155,7 +152,7 @@ def get_metadata(self): Note: * The DPI values signify the resolution of the image on the PDF page, not the DPI metadata embedded in the image file. - * Due to issues in PDFium, this function can be slow. If you only need image size, prefer the faster :meth:`.get_size` instead. + * Due to issues in pdfium, this function might be slow on some kinds of images. If you only need size, prefer :meth:`.get_size` instead. Returns: FPDF_IMAGEOBJ_METADATA: Image metadata structure @@ -310,23 +307,21 @@ def get_filters(self, skip_simple=False): def extract(self, dest, *args, **kwargs): - # TODO rewrite/simplify docstring """ - Extract the image into an independently usable file or byte buffer. - Where possible within PDFium's limited public API, it will be attempted to transfer the image data directly, - avoiding an unnecessary layer of decoding and re-encoding. 
- Otherwise, the fully decoded data will be retrieved and (re-)encoded using :mod:`PIL`. + Extract the image into an independently usable file or byte buffer, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits. + + Only DCTDecode (JPEG) and JPXDecode (JPEG 2000) images can be extracted directly. + Otherwise, the pixel data is decoded, and re-encoded using :mod:`PIL`. + For images with simple filters only, ``get_data(decode_simple=True)`` is used for decoding to preserve higher bit depth or special color formats not supported by FPDF_BITMAP. + For images with complex filters, we have to resort to :meth:`.get_bitmap`, which can be a lossy operation. - As PDFium does not expose all required information, only DCTDecode (JPEG) and JPXDecode (JPEG 2000) images can be extracted directly. - For images with complex filters, the bitmap data is used. Otherwise, ``get_data(decode_simple=True)`` is used, which avoids lossy conversion for images whose bit depth or colour format is not supported by PDFium's bitmap implementation. + Note, this method ignores alpha masks and some other data stored separately from the main data stream (e.g. BlackIsWhite), which might lead to incorrect representation of the image. Parameters: dest (str | io.BytesIO): File prefix or byte buffer to which the image shall be written. fb_format (str): The image format to use in case it is necessary to (re-)encode the data. - fb_render (bool): - Whether the image should be rendered if falling back to bitmap-based extraction. """ # https://crbug.com/pdfium/1930 @@ -367,15 +362,13 @@ def _get_pil_mode(colorspace, bpp): return None -def _extract_smart(image_obj, fb_format=None, fb_render=False): - - # FIXME somewhat hard to read... +def _extract_smart(image_obj, fb_format=None): try: data, info = _extract_direct(image_obj) except ImageNotExtractableError: # TODO? 
log reason why the image cannot be extracted directly - pil_image = image_obj.get_bitmap(render=fb_render).to_pil() + pil_image = image_obj.get_bitmap(render=False).to_pil() else: pil_image = None format = info.format @@ -389,7 +382,9 @@ def _extract_smart(image_obj, fb_format=None, fb_render=False): ) if pil_image: - format = fb_format if fb_format else "tiff" if pil_image.mode == "CMYK" else "png" + format = fb_format + if not format: + format = {"CMYK": "tiff"}.get(pil_image.mode, "png") buffer = yield format pil_image.save(buffer, format=format) if pil_image else buffer.write(data)