From 3c6756b3d5bd836c8b92cab037d2b8f917267820 Mon Sep 17 00:00:00 2001 From: Belyaeva Oksana Date: Fri, 20 Oct 2023 16:59:53 +0300 Subject: [PATCH] ESL-167 extract only word boxes --- .../pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py | 6 ++++-- .../pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py | 6 +++--- dedoc/scripts/test_words_bbox_extraction.py | 7 +++++++ 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py index 162a7b31..64bc2c93 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_line_extractor.py @@ -28,9 +28,11 @@ def __split_image2bboxes(self, image: np.ndarray, page_num: int, language: str, output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_threshold) height, width = image.shape[:2] + extract_line_bbox = self.config.get("labeling_mode", False) + line_boxes = [ - TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_annotations(width, height)) - for line_num, line in enumerate(output_dict.lines) + TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, + annotations=line.get_annotations(width, height, extract_line_bbox)) for line_num, line in enumerate(output_dict.lines) ] return line_boxes diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py index f4ddd595..be72d182 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py @@ -23,7 +23,7 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None: def text(self) -> str: return " ".join(word.text for word in self.words if word.text != "") + "\n" - def get_annotations(self, page_width: int, page_height: int) -> List[Annotation]: + def get_annotations(self, page_width: int, page_height: int, extract_line_bbox: bool) -> List[Annotation]: start = 0 annotations = [] @@ -35,8 +35,8 @@ def get_annotations(self, page_width: int, page_height: int) -> List[Annotation] annotations.append(ConfidenceAnnotation(start, end, str(word.confidence / 100))) annotations.append(BBoxAnnotation(start, end, word.bbox, page_width, page_height)) start += len(word.text) + 1 - - annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height)) + if extract_line_bbox: + annotations.append(BBoxAnnotation(0, start, self.bbox, page_width, page_height)) return annotations @staticmethod diff --git a/dedoc/scripts/test_words_bbox_extraction.py b/dedoc/scripts/test_words_bbox_extraction.py index 64a062dc..888c3273 100644 --- a/dedoc/scripts/test_words_bbox_extraction.py +++ b/dedoc/scripts/test_words_bbox_extraction.py @@ -171,6 +171,13 @@ def test_table_word_extraction(self): image = cv2.imread(self._get_abs_path(file_name)) image = rotate_image(image, page_angle) + + # draw boxes of content's words + structure = result["content"]["structure"] + word_annotations = self.__get_words_annotation(structure) + image = self.__draw_word_annotations(image, word_annotations) + + # draw boxes of table's words tables = result["content"]["tables"] if len(tables) > 0: image = self.__draw_tables_words(tables, image)