From ff13d2f59bfab0abafd657b654887f5a19d7fae7 Mon Sep 17 00:00:00 2001 From: geisserml Date: Tue, 31 May 2022 21:49:23 +0200 Subject: [PATCH] text_extractor: make decoding more robust --- src/pypdfium2/_helpers/text_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypdfium2/_helpers/text_extractor.py b/src/pypdfium2/_helpers/text_extractor.py index 3e2fdfee6..2f825b6c9 100644 --- a/src/pypdfium2/_helpers/text_extractor.py +++ b/src/pypdfium2/_helpers/text_extractor.py @@ -52,7 +52,7 @@ def get_text(self, left=0, bottom=0, right=0, top=0): c_array = (ctypes.c_ushort * (n_chars+1))() pdfium.FPDFText_GetBoundedText(*args, ctypes.cast(c_array, ctypes.POINTER(ctypes.c_ushort)), n_chars) - text = bytes(c_array).decode("utf-16-le")[:-1] + text = bytes(c_array).decode("utf-16-le", errors="ignore")[:-1] return text