From 2488a448b3d5c819af8200b91293cafc70c8dbb2 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Tue, 7 Jan 2025 08:48:15 -0400 Subject: [PATCH] Corrections for 4186 Extracting JPEG-CMYK images consistently need inverting the colors. We have taken this opportunity to reuse as much as possible the creation of the image dictionaries in 'Document.extract_image' and the image block in the Python version of text extraction. --- src/__init__.py | 152 ++++++++++++++++-------------------------------- src/extra.i | 20 +++++-- 2 files changed, 65 insertions(+), 107 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index 5e949dd81..d31f3c62c 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -4196,8 +4196,7 @@ def extract_image(self, xref): raise ValueError("document closed or encrypted") pdf = _as_pdf_document(self) - img_type = 0 - smask = 0 + if not _INRANGE(xref, 1, mupdf.pdf_xref_len(pdf)-1): raise ValueError( MSG_BAD_XREF) @@ -4210,65 +4209,15 @@ def extract_image(self, xref): o = mupdf.pdf_dict_geta(obj, PDF_NAME('SMask'), PDF_NAME('Mask')) if o.m_internal: smask = mupdf.pdf_to_num(o) - - if mupdf.pdf_is_jpx_image(obj): - img_type = mupdf.FZ_IMAGE_JPX - res = mupdf.pdf_load_stream(obj) - ext = "jpx" - if JM_is_jbig2_image(obj): - img_type = mupdf.FZ_IMAGE_JBIG2 - res = mupdf.pdf_load_stream(obj) - ext = "jb2" - res = mupdf.pdf_load_raw_stream(obj) - if img_type == mupdf.FZ_IMAGE_UNKNOWN: - res = mupdf.pdf_load_raw_stream(obj) - _, c = mupdf.fz_buffer_storage(res) - #log( '{=_ c}') - img_type = mupdf.fz_recognize_image_format(c) - ext = JM_image_extension(img_type) - if img_type == mupdf.FZ_IMAGE_UNKNOWN: - res = None - img = mupdf.pdf_load_image(pdf, obj) - ll_cbuf = mupdf.ll_fz_compressed_image_buffer(img.m_internal) - if (ll_cbuf - and ll_cbuf.params.type not in ( - mupdf.FZ_IMAGE_RAW, - mupdf.FZ_IMAGE_FAX, - mupdf.FZ_IMAGE_FLATE, - mupdf.FZ_IMAGE_LZW, - mupdf.FZ_IMAGE_RLD, - ) - ): - img_type = ll_cbuf.params.type - ext = JM_image_extension(img_type) - res = mupdf.FzBuffer(mupdf.ll_fz_keep_buffer(ll_cbuf.buffer)) - else: - res = mupdf.fz_new_buffer_from_image_as_png( - img, - mupdf.FzColorParams(mupdf.fz_default_color_params), - ) - ext = "png" else: - img = mupdf.fz_new_image_from_buffer(res) - - xres, yres = mupdf.fz_image_resolution(img) - width = img.w() - height = img.h() - colorspace = img.n() - bpc = img.bpc() - cs_name = mupdf.fz_colorspace_name(img.colorspace()) + smask = 0 + # load the image + img = mupdf.pdf_load_image(pdf, obj) rc = dict() - rc[ dictkey_ext] = ext - rc[ dictkey_smask] = smask - rc[ dictkey_width] = width - rc[ dictkey_height] = height - rc[ dictkey_colorspace] = colorspace - rc[ dictkey_bpc] = bpc - rc[ dictkey_xres] = xres - rc[ dictkey_yres] = yres - rc[ dictkey_cs_name] = cs_name - rc[ dictkey_image] = JM_BinFromBuffer(res) + _make_image_dict(img, rc) + rc[dictkey_smask] = smask + rc[dictkey_cs_name] = mupdf.fz_colorspace_name(img.colorspace()) return rc def ez_save( @@ -16323,19 +16272,6 @@ def JM_irect_from_py(r): f[i] = FZ_MAX_INF_RECT return mupdf.fz_make_irect(f[0], f[1], f[2], f[3]) - -def JM_is_jbig2_image(dict_): - # fixme: should we remove this function? - return 0 - #filter_ = pdf_dict_get(ctx, dict_, PDF_NAME(Filter)); - #if (pdf_name_eq(ctx, filter_, PDF_NAME(JBIG2Decode))) - # return 1; - #n = pdf_array_len(ctx, filter_); - #for (i = 0; i < n; i++) - # if (pdf_name_eq(ctx, pdf_array_get(ctx, filter_, i), PDF_NAME(JBIG2Decode))) - # return 1; - #return 0; - def JM_listbox_value( annot): ''' ListBox retrieve value @@ -16533,38 +16469,52 @@ def __str__(self): line_dict[dictkey_spans] = span_list return line_rect +def _make_image_dict(img, img_dict): + """Populate a dictionary with information extracted from a given image. -def JM_make_image_block(block, block_dict): - image = block.i_image() - n = mupdf.fz_colorspace_n(image.colorspace()) - w = image.w() - h = image.h() - type_ = mupdf.FZ_IMAGE_UNKNOWN - # fz_compressed_image_buffer() is not available because - # `fz_compressed_buffer` is not copyable. - ll_fz_compressed_buffer = mupdf.ll_fz_compressed_image_buffer(image.m_internal) - if ll_fz_compressed_buffer: - type_ = ll_fz_compressed_buffer.params.type - if type_ < mupdf.FZ_IMAGE_BMP or type_ == mupdf.FZ_IMAGE_JBIG2: - type_ = mupdf.FZ_IMAGE_UNKNOWN - bytes_ = None - if ll_fz_compressed_buffer and type_ != mupdf.FZ_IMAGE_UNKNOWN: - buf = mupdf.FzBuffer( mupdf.ll_fz_keep_buffer( ll_fz_compressed_buffer.buffer)) - ext = JM_image_extension(type_) - else: - buf = mupdf.fz_new_buffer_from_image_as_png(image, mupdf.FzColorParams()) + Used by 'Document.extract_image' and by 'JM_make_image_block'. + Both of these functions will add some more specific information. + """ + img_type = img.fz_compressed_image_type() + ext = JM_image_extension(img_type) + + # compressed image buffer if present, else None + ll_cbuf = mupdf.ll_fz_compressed_image_buffer(img.m_internal) + + if (0 + or not ll_cbuf + or img_type in (mupdf.FZ_IMAGE_JBIG2, mupdf.FZ_IMAGE_UNKNOWN) + or img_type < mupdf.FZ_IMAGE_BMP + ): + # not an image with a compressed buffer: convert to PNG + res = mupdf.fz_new_buffer_from_image_as_png( + img, + mupdf.FzColorParams(mupdf.fz_default_color_params), + ) ext = "png" - bytes_ = JM_BinFromBuffer(buf) - block_dict[ dictkey_width] = w - block_dict[ dictkey_height] = h - block_dict[ dictkey_ext] = ext - block_dict[ dictkey_colorspace] = n - block_dict[ dictkey_xres] = image.xres() - block_dict[ dictkey_yres] = image.yres() - block_dict[ dictkey_bpc] = image.bpc() - block_dict[ dictkey_matrix] = JM_py_from_matrix(block.i_transform()) - block_dict[ dictkey_size] = len(bytes_) - block_dict[ dictkey_image] = bytes_ + elif ext == "jpeg" and img.n() == 4: + # JPEG with CMYK: invert colors + res = mupdf.fz_new_buffer_from_image_as_jpeg( + img, mupdf.FzColorParams(mupdf.fz_default_color_params), 95, 1) + else: + # copy the compressed buffer + res = mupdf.FzBuffer(mupdf.ll_fz_keep_buffer(ll_cbuf.buffer)) + + bytes_ = JM_BinFromBuffer(res) + img_dict[dictkey_width] = img.w() + img_dict[dictkey_height] = img.h() + img_dict[dictkey_ext] = ext + img_dict[dictkey_colorspace] = img.n() + img_dict[dictkey_xres] = img.xres() + img_dict[dictkey_yres] = img.yres() + img_dict[dictkey_bpc] = img.bpc() + img_dict[dictkey_size] = len(bytes_) + img_dict[dictkey_image] = bytes_ + +def JM_make_image_block(block, block_dict): + img = block.i_image() + _make_image_dict(img, block_dict) + block_dict[dictkey_matrix] = JM_py_from_matrix(block.i_transform()) def JM_make_text_block(block, block_dict, raw, buff, tp_rect): diff --git a/src/extra.i b/src/extra.i index 00b67ffa1..99572eab6 100644 --- a/src/extra.i +++ b/src/extra.i @@ -3511,22 +3511,30 @@ void JM_make_image_block(fz_stext_block *block, PyObject *block_dict) int n = fz_colorspace_n(ctx, image->colorspace); int w = image->w; int h = image->h; - const char *ext = NULL; + const char *ext = ""; int type = FZ_IMAGE_UNKNOWN; - if (buffer) + if (buffer) { type = buffer->params.type; + ext = JM_image_extension(type); + } if (type < FZ_IMAGE_BMP || type == FZ_IMAGE_JBIG2) type = FZ_IMAGE_UNKNOWN; PyObject *bytes = NULL; fz_var(bytes); fz_try(ctx) { - if (buffer && type != FZ_IMAGE_UNKNOWN) { - buf = buffer->buffer; - ext = JM_image_extension(type); - } else { + if (!buffer || type == FZ_IMAGE_UNKNOWN) + { buf = freebuf = fz_new_buffer_from_image_as_png(ctx, image, fz_default_color_params); ext = "png"; } + else if (n == 4 && strcmp(ext, "jpeg") == 0) // JPEG CMYK needs another step + { + buf = freebuf = fz_new_buffer_from_image_as_jpeg(ctx, image, fz_default_color_params, 95, 1); + } + else + { + buf = buffer->buffer; + } bytes = JM_BinFromBuffer(buf); } fz_always(ctx) {