Skip to content

Commit

Permalink
tests/: support mupdf-1.25.2's new fz_stext_char flags.
Browse files Browse the repository at this point in the history
Add new `char_flags` member to span dictionary if mupdf >= 1.25.2, containing
extra low-level information.

For example allows detection of invisible text.

Updated docs/textpage.rst.

Improved tests of mupdf version in src/extra.i.

Added tests/test_textextract.py:test_4147() which checks that we can detect
when text is invisible.
  • Loading branch information
julian-smith-artifex-com committed Jan 8, 2025
1 parent 1b4e3a7 commit 734dbe5
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 24 deletions.
16 changes: 16 additions & 0 deletions docs/textpage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ ascender ascender of the font *(float)*
descender descender of the font *(float)*
size font size *(float)*
flags font characteristics *(int)*
char_flags char characteristics *(int)*
color text color in sRGB format *(int)*
text (only for :meth:`extractDICT`) text *(str)*
chars (only for :meth:`extractRAWDICT`) *list* of character dictionaries
Expand Down Expand Up @@ -335,6 +336,21 @@ Test these characteristics like so:

Bits 1 thru 4 are font properties, i.e. encoded in the font program. Please note that this information is not necessarily correct or complete: fonts quite often contain wrong data here.

*"char_flags"* is an integer, which represents extra character properties:

* bit 0: strikeout.
* bit 1: underline.
* bit 2: synthetic.
* bit 4: filled.
* bit 5: stroked.
* bit 6: clipped.

For example if not filled and not stroked (`if not (char_flags & (2**4 | 2**5)):
...`) then the text will be invisible.

(`char_flags` is new in v1.25.2.)


Character Dictionary for :meth:`extractRAWDICT`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
32 changes: 29 additions & 3 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13619,6 +13619,7 @@ class EmptyFileError(FileDataError):
dictkey_filename = "filename"
dictkey_fill = "fill"
dictkey_flags = "flags"
dictkey_char_flags = "char_flags"
dictkey_font = "font"
dictkey_glyph = "glyph"
dictkey_height = "height"
Expand Down Expand Up @@ -14669,7 +14670,9 @@ def JM_char_bbox(line, ch):


def JM_char_font_flags(font, line, ch):
flags = detect_super_script(line, ch)
flags = 0
if line and ch:
flags += detect_super_script(line, ch)
flags += mupdf.fz_font_is_italic(font) * TEXT_FONT_ITALIC
flags += mupdf.fz_font_is_serif(font) * TEXT_FONT_SERIFED
flags += mupdf.fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED
Expand Down Expand Up @@ -16391,19 +16394,27 @@ def __init__(self, rhs=None):
if rhs:
self.size = rhs.size
self.flags = rhs.flags
if mupdf_version_tuple >= (1, 25, 2):
self.char_flags = rhs.char_flags
self.font = rhs.font
self.color = rhs.color
self.asc = rhs.asc
self.desc = rhs.desc
else:
self.size = -1
self.flags = -1
if mupdf_version_tuple >= (1, 25, 2):
self.char_flags = -1
self.font = ''
self.color = -1
self.asc = 0
self.desc = 0
def __str__(self):
return f'{self.size} {self.flags} {self.font} {self.color} {self.asc} {self.desc}'
ret = f'{self.size} {self.flags}'
if mupdf_version_tuple >= (1, 25, 2):
ret += f' {self.char_flags}'
ret += f' {self.font} {self.color} {self.asc} {self.desc}'
return ret

old_style = char_style()
style = char_style()
Expand All @@ -16418,10 +16429,19 @@ def __str__(self):
):
continue

# Info from:
# detect_super_script()
# fz_font_is_italic()
# fz_font_is_serif()
# fz_font_is_monospaced()
# fz_font_is_bold()

flags = JM_char_font_flags(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)), line, ch)
origin = mupdf.FzPoint(ch.m_internal.origin)
style.size = ch.m_internal.size
style.flags = flags
if mupdf_version_tuple >= (1, 25, 2):
style.char_flags = ch.m_internal.flags
style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
if mupdf_version_tuple >= (1, 25):
style.color = ch.m_internal.argb
Expand All @@ -16432,6 +16452,10 @@ def __str__(self):

if (style.size != old_style.size
or style.flags != old_style.flags
or (mupdf_version_tuple >= (1, 25, 2)
and (style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
!= (old_style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC)
)
or style.color != old_style.color
or style.font != old_style.font
):
Expand Down Expand Up @@ -16461,6 +16485,8 @@ def __str__(self):

span[dictkey_size] = style.size
span[dictkey_flags] = style.flags
if mupdf_version_tuple >= (1, 25, 2):
span[dictkey_char_flags] = style.char_flags
span[dictkey_font] = JM_EscapeStrFromStr(style.font)
span[dictkey_color] = style.color
span["ascender"] = asc
Expand Down Expand Up @@ -18696,7 +18722,7 @@ def jm_trace_text_span(dev, span, type_, ctm, colorspace, color, alpha, seqno):
chars = tuple(chars)

if not space_adv:
if not mono:
if not (fflags & TEXT_FONT_MONOSPACED):
c, out_font = mupdf.fz_encode_character_with_fallback( span.font(), 32, 0, 0)
space_adv = mupdf.fz_advance_glyph(
span.font(),
Expand Down
86 changes: 65 additions & 21 deletions src/extra.i
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ otherwise compilation can fail because free() and malloc() are not declared. */
dictkey_filename = PyUnicode_InternFromString("filename");
dictkey_fill = PyUnicode_InternFromString("fill");
dictkey_flags = PyUnicode_InternFromString("flags");
dictkey_char_flags = PyUnicode_InternFromString("char_flags"); /* Only used with mupdf >= 1.25.2. */
dictkey_font = PyUnicode_InternFromString("font");
dictkey_glyph = PyUnicode_InternFromString("glyph");
dictkey_height = PyUnicode_InternFromString("height");
Expand Down Expand Up @@ -103,6 +104,14 @@ catch(...) {
#include <float.h>


/* Packs a (major, minor, patch) version triple into a single integer so that
versions can be compared with ordinary relational operators. Each argument and
the whole expansion are parenthesised so that expression arguments and
surrounding operators cannot change the grouping. */
#define MAKE_MUPDF_VERSION_INT(major, minor, patch) \
        (((major) << 16) + ((minor) << 8) + ((patch) << 0))

/* Packed version built from the FZ_VERSION_* macros. */
#define MUPDF_VERSION_INT MAKE_MUPDF_VERSION_INT(FZ_VERSION_MAJOR, FZ_VERSION_MINOR, FZ_VERSION_PATCH)

/* Non-zero if the packed FZ_VERSION_* version is at least major.minor.patch.
The expansion is parenthesised so it can be combined safely with `!`, `&&` and
`||` in `#if` conditions. */
#define MUPDF_VERSION_GE(major, minor, patch) \
        (MUPDF_VERSION_INT >= MAKE_MUPDF_VERSION_INT(major, minor, patch))


/* Returns equivalent of `repr(x)`. */
static std::string repr(PyObject* x)
{
Expand Down Expand Up @@ -837,6 +846,7 @@ PyObject* dictkey_ext = NULL;
PyObject* dictkey_filename = NULL;
PyObject* dictkey_fill = NULL;
PyObject* dictkey_flags = NULL;
PyObject* dictkey_char_flags = NULL;
PyObject* dictkey_font = NULL;
PyObject* dictkey_glyph = NULL;
PyObject* dictkey_height = NULL;
Expand Down Expand Up @@ -1712,6 +1722,29 @@ static const char* JM_font_name(fz_font* font)
return s + 1;
}

/* Returns 1 if `ch` looks like a superscript within `line`, else 0.

Only lines with wmode 0 and direction (1, 0) are considered. For those, the
character counts as superscript when its origin y is smaller than the origin y
of the line's first character by more than 10% of the character's size. */
static int detect_super_script(fz_stext_line *line, fz_stext_char *ch)
{
    const bool is_plain_horizontal =
            line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0;
    if (!is_plain_horizontal)
    {
        return 0;
    }
    return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
}

/* Builds the span `flags` value for a character.

Combines the superscript indicator (only evaluated when both `line` and `ch`
are non-null) with the italic / serif / monospaced / bold properties reported
by mupdf for `font`, each scaled by its TEXT_FONT_* constant. */
static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char *ch)
{
    /* Superscript detection needs the character's position within its line,
    so it contributes nothing when either pointer is missing. */
    int result = (line && ch)
            ? detect_super_script(line, ch) * TEXT_FONT_SUPERSCRIPT
            : 0;
    result += mupdf::ll_fz_font_is_italic(font) * TEXT_FONT_ITALIC;
    result += mupdf::ll_fz_font_is_serif(font) * TEXT_FONT_SERIFED;
    result += mupdf::ll_fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED;
    result += mupdf::ll_fz_font_is_bold(font) * TEXT_FONT_BOLD;
    return result;
}

static void jm_trace_text_span(
jm_tracedraw_device* dev,
fz_text_span* span,
Expand Down Expand Up @@ -1827,7 +1860,7 @@ static void jm_trace_text_span(
}
if (!space_adv)
{
if (!mono)
if (!(fflags & TEXT_FONT_MONOSPACED))
{
fz_font* out_font = nullptr;
space_adv = mupdf::ll_fz_advance_glyph(
Expand Down Expand Up @@ -2957,25 +2990,6 @@ PyObject* get_cdrawings(mupdf::FzPage& page, PyObject *extended=NULL, PyObject *
}


static int detect_super_script(fz_stext_line *line, fz_stext_char *ch)
{
if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
{
return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
}
return 0;
}

static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char *ch)
{
int flags = detect_super_script(line, ch);
flags += mupdf::ll_fz_font_is_italic(font) * TEXT_FONT_ITALIC;
flags += mupdf::ll_fz_font_is_serif(font) * TEXT_FONT_SERIFED;
flags += mupdf::ll_fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED;
flags += mupdf::ll_fz_font_is_bold(font) * TEXT_FONT_BOLD;
return flags;
}

//---------------------------------------------------------------------------
// APPEND non-ascii runes in unicode escape format to fz_buffer
//---------------------------------------------------------------------------
Expand Down Expand Up @@ -3027,6 +3041,20 @@ mupdf::FzRect JM_make_spanlist(
{
float size = -1;
int flags = -1;

#if MUPDF_VERSION_GE(1, 25, 2)
/* From mupdf:include/mupdf/fitz/structured-text.h:fz_stext_char::flags, which
uses anonymous enum values:
FZ_STEXT_STRIKEOUT = 1,
FZ_STEXT_UNDERLINE = 2,
FZ_STEXT_SYNTHETIC = 4,
FZ_STEXT_FILLED = 16,
FZ_STEXT_STROKED = 32,
FZ_STEXT_CLIPPED = 64
*/
int char_flags;
#endif

const char *font = "";
unsigned int color = -1;
float asc = 0;
Expand All @@ -3042,12 +3070,22 @@ mupdf::FzRect JM_make_spanlist(
{
continue;
}
/* Info from:
detect_super_script()
fz_font_is_italic()
fz_font_is_serif()
fz_font_is_monospaced()
fz_font_is_bold()
*/
int flags = JM_char_font_flags( ch.m_internal->font, line.m_internal, ch.m_internal);
fz_point origin = ch.m_internal->origin;
style.size = ch.m_internal->size;
style.flags = flags;
#if MUPDF_VERSION_GE(1, 25, 2)
style.char_flags = ch.m_internal->flags;
#endif
style.font = JM_font_name(ch.m_internal->font);
#if (FZ_VERSION_MAJOR > 1 || (FZ_VERSION_MAJOR == 1 && FZ_VERSION_MINOR >= 25))
#if MUPDF_VERSION_GE(1, 25, 0)
style.color = ch.m_internal->argb;
#else
style.color = ch.m_internal->color;
Expand All @@ -3058,6 +3096,9 @@ mupdf::FzRect JM_make_spanlist(
if (0
|| style.size != old_style.size
|| style.flags != old_style.flags
#if MUPDF_VERSION_GE(1, 25, 2)
|| (style.char_flags & ~FZ_STEXT_SYNTHETIC) != (old_style.char_flags & ~FZ_STEXT_SYNTHETIC)
#endif
|| style.color != old_style.color
|| strcmp(style.font, old_style.font) != 0
)
Expand Down Expand Up @@ -3095,6 +3136,9 @@ mupdf::FzRect JM_make_spanlist(

DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size));
DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("i", style.flags));
#if MUPDF_VERSION_GE(1, 25, 2)
DICT_SETITEM_DROP(span, dictkey_char_flags, Py_BuildValue("i", style.char_flags));
#endif
DICT_SETITEM_DROP(span, dictkey_font, JM_EscapeStrFromStr(style.font));
DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("i", style.color));
DICT_SETITEMSTR_DROP(span, "ascender", Py_BuildValue("f", asc));
Expand Down
1 change: 1 addition & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,7 @@ def get_text(
"blocks": pymupdf.TEXTFLAGS_BLOCKS,
}
option = option.lower()
assert option in formats
if option not in formats:
option = "text"
if flags is None:
Expand Down
Binary file added tests/resources/test_4147.pdf
Binary file not shown.
27 changes: 27 additions & 0 deletions tests/test_textextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,3 +395,30 @@ def test_3725():
text = page.get_text()
if 0:
print(textwrap.indent(text, ' '))

def test_4147():
    """Check that span `char_flags` can be used to detect invisible text.

    Extracts `rawdict` text from the first page of two documents, one whose
    text is expected to be invisible and one whose text is expected to be
    visible. With mupdf >= 1.25.2, every text span of the visible document
    must have FZ_STEXT_FILLED set, and every text span of the invisible
    document must have neither FZ_STEXT_FILLED nor FZ_STEXT_STROKED set.
    With older mupdf, spans must not contain a `char_flags` key at all.
    """
    print()
    for expect_visible, path in (
            (False, os.path.normpath(f'{__file__}/../../tests/resources/test_4147.pdf')),
            (True, os.path.normpath(f'{__file__}/../../tests/resources/symbol-list.pdf')),
            ):
        print(f'{expect_visible} {path=}')
        with pymupdf.open(path) as document:
            page = document[0]
            text = page.get_text('rawdict')
            for block in text['blocks']:
                if block['type'] != 0:
                    # Only blocks of type 0 are inspected.
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if pymupdf.mupdf_version_tuple >= (1, 25, 2):
                            print(f' span: {span["flags"]=:#x} {span["char_flags"]=:#x}')
                            if expect_visible:
                                assert span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED
                            else:
                                assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED)
                                assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_STROKED)
                        else:
                            print(f' span: {span["flags"]=:#x}')
                            assert 'char_flags' not in span

0 comments on commit 734dbe5

Please sign in to comment.