Skip to content

Commit

Permalink
Avoid -ve color values in text-span dicts and add alpha value.
Browse files Browse the repository at this point in the history
The color value from MuPDF now contains the alpha value in its top 8 bits, so it
appeared negative whenever the alpha value set the top bit.

Also added alpha value to the dict.

Addresses #4139.
  • Loading branch information
julian-smith-artifex-com committed Jan 20, 2025
1 parent 0446c42 commit bbae37e
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 17 deletions.
9 changes: 8 additions & 1 deletion docs/textpage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -292,15 +292,22 @@ descender descender of the font *(float)*
size font size *(float)*
flags font characteristics *(int)*
char_flags char characteristics *(int)*
color text color in sRGB format *(int)*
color text color in sRGB format 0xRRGGBB *(int)*
alpha text opacity 0..255 *(int)*
text (only for :meth:`extractDICT`) text *(str)*
chars (only for :meth:`extractRAWDICT`) *list* of character dictionaries
=============== =====================================================================

|history_begin|

*(New in version 1.25.3.0):* Added *"alpha"* item.

*(New in version 1.16.0):* *"color"* is the text color encoded as an sRGB integer, e.g. 0xFF0000 for red. Two functions convert this integer back to a tuple: :meth:`sRGB_to_pdf` returns (r, g, b) with float values from 0 to 1 (PDF format), and :meth:`sRGB_to_rgb` returns (R, G, B) with integer values from 0 to 255.

*(New in v1.18.5):* *"ascender"* and *"descender"* are font properties, provided relative to :data:`fontsize` 1. Note that descender is a negative value. The following picture shows the relationship to other values and properties.

|history_end|

.. image:: images/img-asc-desc.*
:scale: 60

Expand Down
14 changes: 8 additions & 6 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16355,7 +16355,7 @@ def __init__(self, rhs=None):
if mupdf_version_tuple >= (1, 25, 2):
self.char_flags = rhs.char_flags
self.font = rhs.font
self.color = rhs.color
self.argb = rhs.argb
self.asc = rhs.asc
self.desc = rhs.desc
else:
Expand All @@ -16364,7 +16364,7 @@ def __init__(self, rhs=None):
if mupdf_version_tuple >= (1, 25, 2):
self.char_flags = -1
self.font = ''
self.color = -1
self.argb = -1
self.asc = 0
self.desc = 0
def __str__(self):
Expand Down Expand Up @@ -16402,9 +16402,9 @@ def __str__(self):
style.char_flags = ch.m_internal.flags
style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
if mupdf_version_tuple >= (1, 25):
style.color = ch.m_internal.argb
style.argb = ch.m_internal.argb
else:
style.color = ch.m_internal.color
style.argb = ch.m_internal.color
style.asc = JM_font_ascender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
style.desc = JM_font_descender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))

Expand All @@ -16414,7 +16414,7 @@ def __str__(self):
and (style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
!= (old_style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
)
or style.color != old_style.color
or style.argb != old_style.argb
or style.font != old_style.font
):
if old_style.size >= 0:
Expand Down Expand Up @@ -16446,7 +16446,9 @@ def __str__(self):
if mupdf_version_tuple >= (1, 25, 2):
span[dictkey_char_flags] = style.char_flags
span[dictkey_font] = JM_EscapeStrFromStr(style.font)
span[dictkey_color] = style.color
span[dictkey_color] = style.argb & 0xffffff
if mupdf_version_tuple >= (1, 25, 0):
span['alpha'] = style.argb >> 24
span["ascender"] = asc
span["descender"] = desc

Expand Down
22 changes: 12 additions & 10 deletions src/extra.i
Original file line number Diff line number Diff line change
Expand Up @@ -1920,7 +1920,6 @@ static void jm_trace_text_span(
<< " fsize=" << fsize
<< " linewidth=" << linewidth
<< "\n";

dict_setitem_drop(span_dict, dictkey_color, Py_BuildValue("fff", rgb[0], rgb[1], rgb[2]));
dict_setitem_drop(span_dict, dictkey_size, PyFloat_FromDouble(fsize));
dict_setitemstr_drop(span_dict, "opacity", PyFloat_FromDouble((double) alpha));
Expand Down Expand Up @@ -3040,7 +3039,7 @@ mupdf::FzRect JM_make_spanlist(
struct char_style
{
float size = -1;
int flags = -1;
unsigned flags = 0;

#if MUPDF_VERSION_GE(1, 25, 2)
/* From mupdf:include/mupdf/fitz/structured-text.h:fz_stext_char::flags, which
Expand All @@ -3052,11 +3051,11 @@ mupdf::FzRect JM_make_spanlist(
FZ_STEXT_STROKED = 32,
FZ_STEXT_CLIPPED = 64
*/
int char_flags;
unsigned char_flags = 0;
#endif

const char *font = "";
unsigned int color = -1;
unsigned argb = 0;
float asc = 0;
float desc = 0;
};
Expand Down Expand Up @@ -3086,9 +3085,9 @@ mupdf::FzRect JM_make_spanlist(
#endif
style.font = JM_font_name(ch.m_internal->font);
#if MUPDF_VERSION_GE(1, 25, 0)
style.color = ch.m_internal->argb;
style.argb = ch.m_internal->argb;
#else
style.color = ch.m_internal->color;
style.argb = ch.m_internal->color;
#endif
style.asc = JM_font_ascender(ch.m_internal->font);
style.desc = JM_font_descender(ch.m_internal->font);
Expand All @@ -3099,7 +3098,7 @@ mupdf::FzRect JM_make_spanlist(
#if MUPDF_VERSION_GE(1, 25, 2)
|| (style.char_flags & ~FZ_STEXT_SYNTHETIC) != (old_style.char_flags & ~FZ_STEXT_SYNTHETIC)
#endif
|| style.color != old_style.color
|| style.argb != old_style.argb
|| strcmp(style.font, old_style.font) != 0
)
{
Expand Down Expand Up @@ -3135,12 +3134,15 @@ mupdf::FzRect JM_make_spanlist(
}

DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size));
DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("i", style.flags));
DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("I", style.flags));
#if MUPDF_VERSION_GE(1, 25, 2)
DICT_SETITEM_DROP(span, dictkey_char_flags, Py_BuildValue("i", style.char_flags));
DICT_SETITEM_DROP(span, dictkey_char_flags, Py_BuildValue("I", style.char_flags));
#endif
DICT_SETITEM_DROP(span, dictkey_font, JM_EscapeStrFromStr(style.font));
DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("i", style.color));
DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("I", style.argb & 0xffffff));
#if MUPDF_VERSION_GE(1, 25, 0)
DICT_SETITEMSTR_DROP(span, "alpha", Py_BuildValue("I", style.argb >> 24));
#endif
DICT_SETITEMSTR_DROP(span, "ascender", Py_BuildValue("f", asc));
DICT_SETITEMSTR_DROP(span, "descender", Py_BuildValue("f", desc));

Expand Down
Binary file added tests/resources/test_4139.pdf
Binary file not shown.
25 changes: 25 additions & 0 deletions tests/test_textextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,3 +422,28 @@ def test_4147():
else:
print(f' span: {span["flags"]=:#x}')
assert 'char_flags' not in span


def test_4139():
    """Check the 'color' and 'alpha' span items for tests/resources/test_4139.pdf.

    Text is extracted from the first page as a dict. For the first span
    encountered with each distinct color value, the color must be 0; with
    MuPDF >= 1.25 that span's 'alpha' must be 255, otherwise the span must
    not contain an 'alpha' key.
    """
    # NOTE(review): block nesting was reconstructed from a listing without
    # indentation; the assertions are assumed to belong to the "new color"
    # branch - confirm against the repository.
    path = os.path.normpath(f'{__file__}/../../tests/resources/test_4139.pdf')
    flags = (
        pymupdf.TEXT_PRESERVE_IMAGES
        | pymupdf.TEXT_PRESERVE_WHITESPACE
        | pymupdf.TEXT_CID_FOR_UNKNOWN_UNICODE
    )
    with pymupdf.open(path) as document:
        page = document[0]
        dicts = page.get_text('dict', flags=flags, sort=True)
        seen = set()
        for b_ctr, block in enumerate(dicts['blocks']):
            # Blocks without a 'lines' entry contribute no spans.
            for l_ctr, line in enumerate(block.get('lines', [])):
                for s_ctr, s in enumerate(line['spans']):
                    color = s.get('color')
                    # Only inspect the first span seen for each color value.
                    if color is None or color in seen:
                        continue
                    seen.add(color)
                    print(f"B{b_ctr}.L{l_ctr}.S{s_ctr}: {color=} {hex(color)=} {s=}")
                    assert color == 0, f'{s=}'
                    if pymupdf.mupdf_version_tuple >= (1, 25):
                        assert s['alpha'] == 255
                    else:
                        assert 'alpha' not in s

0 comments on commit bbae37e

Please sign in to comment.