diff --git a/docs/textpage.rst b/docs/textpage.rst index a06d7e4eb..ecbc1ab34 100644 --- a/docs/textpage.rst +++ b/docs/textpage.rst @@ -288,6 +288,7 @@ ascender ascender of the font *(float)* descender descender of the font *(float)* size font size *(float)* flags font characteristics *(int)* +char_flags char characteristics *(int)* color text color in sRGB format *(int)* text (only for :meth:`extractDICT`) text *(str)* chars (only for :meth:`extractRAWDICT`) *list* of character dictionaries @@ -335,6 +336,21 @@ Test these characteristics like so: Bits 1 thru 4 are font properties, i.e. encoded in the font program. Please note, that this information is not necessarily correct or complete: fonts quite often contain wrong data here. +*"char_flags"* is an integer, which represents extra character properties: + +* bit 0: strikeout. +* bit 1: underline. +* bit 2: synthetic. +* bit 4: filled. +* bit 5: stroked. +* bit 6: clipped. + +For example, if neither filled nor stroked (`if not (char_flags & (2**4 | 2**5)): +...`) then the text will be invisible. + +(`char_flags` is new in v1.25.2.) 
+ + Character Dictionary for :meth:`extractRAWDICT` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/__init__.py b/src/__init__.py index 7075a33d2..5e949dd81 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -13619,6 +13619,7 @@ class EmptyFileError(FileDataError): dictkey_filename = "filename" dictkey_fill = "fill" dictkey_flags = "flags" +dictkey_char_flags = "char_flags" dictkey_font = "font" dictkey_glyph = "glyph" dictkey_height = "height" @@ -14669,7 +14670,9 @@ def JM_char_bbox(line, ch): def JM_char_font_flags(font, line, ch): - flags = detect_super_script(line, ch) + flags = 0 + if line and ch: + flags += detect_super_script(line, ch) flags += mupdf.fz_font_is_italic(font) * TEXT_FONT_ITALIC flags += mupdf.fz_font_is_serif(font) * TEXT_FONT_SERIFED flags += mupdf.fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED @@ -16391,6 +16394,8 @@ def __init__(self, rhs=None): if rhs: self.size = rhs.size self.flags = rhs.flags + if mupdf_version_tuple >= (1, 25, 2): + self.char_flags = rhs.char_flags self.font = rhs.font self.color = rhs.color self.asc = rhs.asc @@ -16398,12 +16403,18 @@ def __init__(self, rhs=None): else: self.size = -1 self.flags = -1 + if mupdf_version_tuple >= (1, 25, 2): + self.char_flags = -1 self.font = '' self.color = -1 self.asc = 0 self.desc = 0 def __str__(self): - return f'{self.size} {self.flags} {self.font} {self.color} {self.asc} {self.desc}' + ret = f'{self.size} {self.flags}' + if mupdf_version_tuple >= (1, 25, 2): + ret += f' {self.char_flags}' + ret += f' {self.font} {self.color} {self.asc} {self.desc}' + return ret old_style = char_style() style = char_style() @@ -16418,10 +16429,19 @@ def __str__(self): ): continue + # Info from: + # detect_super_script() + # fz_font_is_italic() + # fz_font_is_serif() + # fz_font_is_monospaced() + # fz_font_is_bold() + flags = JM_char_font_flags(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)), line, ch) origin = mupdf.FzPoint(ch.m_internal.origin) style.size = 
ch.m_internal.size style.flags = flags + if mupdf_version_tuple >= (1, 25, 2): + style.char_flags = ch.m_internal.flags style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font))) if mupdf_version_tuple >= (1, 25): style.color = ch.m_internal.argb @@ -16432,6 +16452,10 @@ def __str__(self): if (style.size != old_style.size or style.flags != old_style.flags + or (mupdf_version_tuple >= (1, 25, 2) + and (style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC) + != (old_style.char_flags & ~mupdf.FZ_STEXT_SYNTHETIC) + ) or style.color != old_style.color or style.font != old_style.font ): @@ -16461,6 +16485,8 @@ def __str__(self): span[dictkey_size] = style.size span[dictkey_flags] = style.flags + if mupdf_version_tuple >= (1, 25, 2): + span[dictkey_char_flags] = style.char_flags span[dictkey_font] = JM_EscapeStrFromStr(style.font) span[dictkey_color] = style.color span["ascender"] = asc @@ -18696,7 +18722,7 @@ def jm_trace_text_span(dev, span, type_, ctm, colorspace, color, alpha, seqno): chars = tuple(chars) if not space_adv: - if not mono: + if not (fflags & TEXT_FONT_MONOSPACED): c, out_font = mupdf.fz_encode_character_with_fallback( span.font(), 32, 0, 0) space_adv = mupdf.fz_advance_glyph( span.font(), diff --git a/src/extra.i b/src/extra.i index f8f88a6c5..00b67ffa1 100644 --- a/src/extra.i +++ b/src/extra.i @@ -43,6 +43,7 @@ otherwise compilation can fail because free() and malloc() are not declared. */ dictkey_filename = PyUnicode_InternFromString("filename"); dictkey_fill = PyUnicode_InternFromString("fill"); dictkey_flags = PyUnicode_InternFromString("flags"); + dictkey_char_flags = PyUnicode_InternFromString("char_flags"); /* Only used with mupdf >= 1.25.2. */ dictkey_font = PyUnicode_InternFromString("font"); dictkey_glyph = PyUnicode_InternFromString("glyph"); dictkey_height = PyUnicode_InternFromString("height"); @@ -103,6 +104,14 @@ catch(...) 
{ #include +#define MAKE_MUPDF_VERSION_INT(major, minor, patch) (((major) << 16) + ((minor) << 8) + ((patch) << 0)) + +#define MUPDF_VERSION_INT MAKE_MUPDF_VERSION_INT(FZ_VERSION_MAJOR, FZ_VERSION_MINOR, FZ_VERSION_PATCH) +/* True iff the compile-time MuPDF version (FZ_VERSION_*) is >= major.minor.patch; fully parenthesised so it composes with `!`, `&&` and `||` inside #if. */ +#define MUPDF_VERSION_GE(major, minor, patch) \ + (MUPDF_VERSION_INT >= MAKE_MUPDF_VERSION_INT(major, minor, patch)) + + /* Returns equivalent of `repr(x)`. */ static std::string repr(PyObject* x) { @@ -837,6 +846,7 @@ PyObject* dictkey_ext = NULL; PyObject* dictkey_filename = NULL; PyObject* dictkey_fill = NULL; PyObject* dictkey_flags = NULL; +PyObject* dictkey_char_flags = NULL; PyObject* dictkey_font = NULL; PyObject* dictkey_glyph = NULL; PyObject* dictkey_height = NULL; @@ -1712,6 +1722,29 @@ static const char* JM_font_name(fz_font* font) return s + 1; } +static int detect_super_script(fz_stext_line *line, fz_stext_char *ch) +{ + if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0) + { + return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; + } + return 0; +} + +static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char *ch) +{ + int flags = 0; + if (line && ch) + { + flags += detect_super_script(line, ch) * TEXT_FONT_SUPERSCRIPT; + } + flags += mupdf::ll_fz_font_is_italic(font) * TEXT_FONT_ITALIC; + flags += mupdf::ll_fz_font_is_serif(font) * TEXT_FONT_SERIFED; + flags += mupdf::ll_fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED; + flags += mupdf::ll_fz_font_is_bold(font) * TEXT_FONT_BOLD; + return flags; +} + static void jm_trace_text_span( jm_tracedraw_device* dev, fz_text_span* span, @@ -1827,7 +1860,7 @@ static void jm_trace_text_span( } if (!space_adv) { - if (!mono) + if (!(fflags & TEXT_FONT_MONOSPACED)) { fz_font* out_font = nullptr; space_adv = mupdf::ll_fz_advance_glyph( @@ -2957,25 +2990,6 @@ PyObject* get_cdrawings(mupdf::FzPage& page, PyObject *extended=NULL, PyObject * } -static int detect_super_script(fz_stext_line *line, fz_stext_char *ch) -{ - if (line->wmode == 0 && line->dir.x == 
1 && line->dir.y == 0) - { - return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; - } - return 0; -} - -static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char *ch) -{ - int flags = detect_super_script(line, ch); - flags += mupdf::ll_fz_font_is_italic(font) * TEXT_FONT_ITALIC; - flags += mupdf::ll_fz_font_is_serif(font) * TEXT_FONT_SERIFED; - flags += mupdf::ll_fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED; - flags += mupdf::ll_fz_font_is_bold(font) * TEXT_FONT_BOLD; - return flags; -} - //--------------------------------------------------------------------------- // APPEND non-ascii runes in unicode escape format to fz_buffer //--------------------------------------------------------------------------- @@ -3027,6 +3041,20 @@ mupdf::FzRect JM_make_spanlist( { float size = -1; int flags = -1; + + #if MUPDF_VERSION_GE(1, 25, 2) + /* From mupdf:include/mupdf/fitz/structured-text.h:fz_stext_char::flags, which + uses anonymous enum values: + FZ_STEXT_STRIKEOUT = 1, + FZ_STEXT_UNDERLINE = 2, + FZ_STEXT_SYNTHETIC = 4, + FZ_STEXT_FILLED = 16, + FZ_STEXT_STROKED = 32, + FZ_STEXT_CLIPPED = 64 + */ + int char_flags = -1; /* -1 = not yet set, same sentinel as `size` and `flags`. */ + #endif + const char *font = ""; unsigned int color = -1; float asc = 0; @@ -3042,12 +3070,22 @@ mupdf::FzRect JM_make_spanlist( { continue; } + /* Info from: + detect_super_script() + fz_font_is_italic() + fz_font_is_serif() + fz_font_is_monospaced() + fz_font_is_bold() + */ int flags = JM_char_font_flags( ch.m_internal->font, line.m_internal, ch.m_internal); fz_point origin = ch.m_internal->origin; style.size = ch.m_internal->size; style.flags = flags; + #if MUPDF_VERSION_GE(1, 25, 2) + style.char_flags = ch.m_internal->flags; + #endif style.font = JM_font_name(ch.m_internal->font); - #if (FZ_VERSION_MAJOR > 1 || (FZ_VERSION_MAJOR == 1 && FZ_VERSION_MINOR >= 25)) + #if MUPDF_VERSION_GE(1, 25, 0) style.color = ch.m_internal->argb; #else style.color = ch.m_internal->color; @@ -3058,6 +3096,9 @@ mupdf::FzRect 
JM_make_spanlist( if (0 || style.size != old_style.size || style.flags != old_style.flags + #if MUPDF_VERSION_GE(1, 25, 2) + || (style.char_flags & ~FZ_STEXT_SYNTHETIC) != (old_style.char_flags & ~FZ_STEXT_SYNTHETIC) + #endif || style.color != old_style.color || strcmp(style.font, old_style.font) != 0 ) @@ -3095,6 +3136,9 @@ mupdf::FzRect JM_make_spanlist( DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size)); DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("i", style.flags)); + #if MUPDF_VERSION_GE(1, 25, 2) + DICT_SETITEM_DROP(span, dictkey_char_flags, Py_BuildValue("i", style.char_flags)); + #endif DICT_SETITEM_DROP(span, dictkey_font, JM_EscapeStrFromStr(style.font)); DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("i", style.color)); DICT_SETITEMSTR_DROP(span, "ascender", Py_BuildValue("f", asc)); diff --git a/src/utils.py b/src/utils.py index 4c3652a0d..c5f3dd521 100644 --- a/src/utils.py +++ b/src/utils.py @@ -929,6 +929,7 @@ def get_text( "blocks": pymupdf.TEXTFLAGS_BLOCKS, } option = option.lower() + assert option in formats if option not in formats: option = "text" if flags is None: diff --git a/tests/resources/test_4147.pdf b/tests/resources/test_4147.pdf new file mode 100644 index 000000000..4d2c38606 Binary files /dev/null and b/tests/resources/test_4147.pdf differ diff --git a/tests/test_textextract.py b/tests/test_textextract.py index 25845bd5e..19031f709 100644 --- a/tests/test_textextract.py +++ b/tests/test_textextract.py @@ -395,3 +395,30 @@ def test_3725(): text = page.get_text() if 0: print(textwrap.indent(text, ' ')) + +def test_4147(): + print() + items = list() + for expect_visible, path in ( + (False, os.path.normpath(f'{__file__}/../../tests/resources/test_4147.pdf')), + (True, os.path.normpath(f'{__file__}/../../tests/resources/symbol-list.pdf')), + ): + print(f'{expect_visible} {path=}') + with pymupdf.open(path) as document: + page = document[0] + text = page.get_text('rawdict') + for block in text['blocks']: + 
if block['type'] == 0: + for line in block['lines']: + for span in line['spans']: + #print(f' {span=}') + if pymupdf.mupdf_version_tuple >= (1, 25, 2): + print(f' span: {span["flags"]=:#x} {span["char_flags"]=:#x}') + if expect_visible: + assert span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED + else: + assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_FILLED) + assert not (span['char_flags'] & pymupdf.mupdf.FZ_STEXT_STROKED) + else: + print(f' span: {span["flags"]=:#x}') + assert 'char_flags' not in span