Merge pull request #214 from OpenPecha/rotation

fix(pecha): handle box rotation
OpenPecha · Nov 7, 2022 · 552c4ad · 552c4ad
2 parents 08a902c + 182c674
commit 552c4ad
Show file tree

Hide file tree

Showing 18 changed files with 368 additions and 133 deletions.
diff --git a/openpecha/core/metadata.py b/openpecha/core/metadata.py
@@ -22,7 +22,7 @@ class CopyrightStatus(Enum):
 class Copyright(BaseModel):
     status: CopyrightStatus = CopyrightStatus.UNKNOWN
     notice: Optional[str] = ""
-    info_url: Optional[AnyHttpUrl] = None
+    info_url: Optional[str] = None
 
     class Config:
         extra = Extra.forbid

diff --git a/openpecha/core/pecha.py b/openpecha/core/pecha.py
@@ -1,4 +1,3 @@
-import json
 import shutil
 import warnings
 from collections import defaultdict
@@ -322,7 +321,7 @@ def _read_components(self):
         return res
 
     def save_meta(self):
-        dump_yaml(json.loads(self.meta.json()), self.meta_fn)
+        dump_yaml(self.meta.dict(exclude_none=True), self.meta_fn)
 
     def save_single_base(self, base_name: str, content: str = None):
         if not content:
@@ -336,7 +335,7 @@ def save_base(self):
 
     def save_layer(self, base_name: str, layer_name: LayerEnum, layer: Layer):
         layer_fn = self._mkdir(self.layers_path / base_name) / f"{layer_name.value}.yml"
-        dump_yaml(json.loads(layer.json()), layer_fn)
+        dump_yaml(layer.dict(exclude_none=True), layer_fn)
         return layer_fn
 
     def save_layers(self):
@@ -346,7 +345,7 @@ def save_layers(self):
 
     def save_index(self):
         try:
-            dump_yaml(json.loads(self.index.json()), self.index_fn)
+            dump_yaml(self.index.dict(exclude_none=True), self.index_fn)
         except FileNotFoundError:
             pass
 

diff --git a/openpecha/formatters/layers.py b/openpecha/formatters/layers.py
@@ -3,6 +3,7 @@
 """
 
 from collections import namedtuple
+from enum import Enum
 
 __all__ = [
     "Layer",
@@ -30,7 +31,7 @@
 ]
 
 
-class AnnType:
+class AnnType(Enum):
     book_title = "BookTitle"
     sub_title = "SubTitle"
     book_number = "BookNumber"
@@ -54,7 +55,6 @@ class AnnType:
     durchen = "Durchen"
     footnote = "Footnote"
 
-
 class _attr_names:
     # Layer
     ID = "id"  # Unique id for annotation of specific Pecha or Abstract work. type: str

diff --git a/openpecha/formatters/ocr/google_vision.py b/openpecha/formatters/ocr/google_vision.py
@@ -81,6 +81,50 @@ def get_language_code_from_gv_poly(self, gv_poly):
         # English is a kind of default for our purpose
         return "en"
 
+    @staticmethod
+    def get_bboxinfo_from_vertices(vertices):
+        """
+        Compute the axis-aligned bounding box and the rotation of a list of vertices.
+
+        Vertices do not always have dots in the same order. The current hypothesis
+        is that their order represents the rotation of characters detected by
+        the OCR.
+
+        This is not documented on
+
+        https://cloud.google.com/vision/docs/reference/rest/v1/projects.locations.products.referenceImages#Vertex
+
+        though, so use the angle value with caution.
+
+        Args:
+            vertices (list): dicts with "x" and "y" keys; a vertex missing
+                either key is ignored.
+
+        Returns:
+            list: [smallest_x, largest_x, smallest_y, largest_y, angle], or None
+            when no vertex carries both coordinates. angle is None unless there
+            are exactly 4 vertices, in which case it is 0, 90, 180 or 270.
+        """
+        if not vertices:
+            return None
+        # None (rather than -1) marks "not seen yet" so that a genuine
+        # coordinate of -1 is not mistaken for the sentinel
+        idx_smallest = None
+        smallest_x = None
+        smallest_y = None
+        largest_x = None
+        largest_y = None
+        for idx, v in enumerate(vertices):
+            if "x" not in v or "y" not in v:
+                continue
+            x, y = v["x"], v["y"]
+            smallest_x = x if smallest_x is None else min(x, smallest_x)
+            smallest_y = y if smallest_y is None else min(y, smallest_y)
+            largest_x = x if largest_x is None else max(x, largest_x)
+            largest_y = y if largest_y is None else max(y, largest_y)
+            # here we have to account for cases where the 4 dots don't form a rectangle
+            # because coordinates are shifted by 1, see test_bbox_info for some example
+            # (the comparison is made against the running minimum on purpose)
+            if abs(x - smallest_x) < 3 and abs(y - smallest_y) < 3:
+                idx_smallest = idx
+        if smallest_x is None:
+            # no vertex had both coordinates
+            return None
+        angle = None
+        if len(vertices) == 4 and idx_smallest is not None:
+            # index of the top-left corner in the vertex list -> clockwise rotation
+            angle = {1: 270, 2: 180, 3: 90}.get(idx_smallest, 0)
+        return [smallest_x, largest_x, smallest_y, largest_y, angle]
+
     def dict_to_bbox(self, word):
         """Convert bounding bbox to BBox object
 
@@ -90,21 +134,32 @@ def dict_to_bbox(self, word):
         Returns:
             obj: BBox object of bounding bbox
         """
-        text = word.get('text', '')
         confidence = word.get('confidence')
-        # the language returned by Google OCR is not particularly helpful
-        # language = self.get_language_code_from_gv_poly(word)
-        # instead we use our custom detection system
-        language = self.get_main_language_code(text)
         if 'boundingBox' not in word or 'vertices' not in word['boundingBox']:
             return None
         vertices = word['boundingBox']['vertices']
-        if len(vertices) != 4 or 'x' not in vertices[0] or 'x' not in vertices[1] or 'y' not in vertices[0] or 'y' not in vertices[2]:
+        bboxinfo = GoogleVisionFormatter.get_bboxinfo_from_vertices(vertices)
+        if bboxinfo is None:
             return None
-        return BBox(vertices[0]['x'], vertices[1]['x'], vertices[0]['y'], vertices[2]['y'], 
-            text=text, 
-            confidence=confidence, 
-            language=language)
+        # the angle is None when it cannot be determined (e.g. not exactly
+        # 4 vertices); such a box is kept rather than compared with 0, which
+        # would raise a TypeError
+        angle = bboxinfo[4]
+        if self.remove_rotated_boxes and angle is not None and angle > 0:
+            return None
+        return BBox(bboxinfo[0], bboxinfo[1], bboxinfo[2], bboxinfo[3], angle,
+            confidence=confidence)
+
+    @staticmethod
+    def get_width_of_vertices(vertices):
+        """
+        Return the horizontal extent (largest x minus smallest x) of the vertices.
+
+        Vertices missing an "x" or "y" key are ignored.
+
+        Returns:
+            The width, or None when there are fewer than 4 vertices or when
+            none of them carries both coordinates.
+        """
+        if len(vertices) < 4:
+            return None
+        # collecting the values avoids a -1 sentinel that a real coordinate
+        # of -1 would be confused with
+        xs = [v["x"] for v in vertices if "x" in v and "y" in v]
+        if not xs:
+            return None
+        return max(xs) - min(xs)
 
     def get_char_base_bboxes_and_avg_width(self, response):
         """Return bounding bboxs in page response
@@ -116,29 +171,35 @@ def get_char_base_bboxes_and_avg_width(self, response):
             list: list of BBox object which saves required info of a bounding bbox
         """
         bboxes = []
-        cur_word = ""
         widths = []
         for page in response['fullTextAnnotation']['pages']:
             for block in page['blocks']:
                 for paragraph in block['paragraphs']:
                     for word in paragraph['words']:
+                        bbox = self.dict_to_bbox(word)
+                        if bbox is None:
+                            # case where we ignore the bbox for some reason
+                            # for instance rotated text
+                            continue
+                        cur_word = ""
                         for symbol in word['symbols']:
                             symbolunicat = unicodedata.category(symbol['text'][0])
                             if symbolunicat in UNICODE_CHARCAT_FOR_WIDTH:
                                 vertices = symbol['boundingBox']['vertices']
-                                if len(vertices) < 2 or 'x' not in vertices[0] or 'x' not in vertices[1]:
-                                    logging.debug("symbol box with no coodinates, ignore for average width")
-                                    continue
-                                logging.debug("consider '%s' (cat %s) for avg width", symbol['text'], symbolunicat)
-                                widths.append(vertices[1]['x'] - vertices[0]['x'])
+                                width = GoogleVisionFormatter.get_width_of_vertices(vertices)
+                                # width is None when the symbol box has no usable
+                                # coordinates; None cannot be compared with 0
+                                if width is not None and width > 0:
+                                    widths.append(width)
                             cur_word += symbol['text']
                             if self.has_space_attached(symbol):
                                 cur_word += " "
-                        word['text'] = cur_word
-                        cur_word = ""
-                        bbox = self.dict_to_bbox(word)
-                        bboxes.append(bbox)
-        avg_width = statistics.mean(widths)
+                        if cur_word:
+                            bbox.text = cur_word
+                            # the language returned by Google OCR is not particularly helpful
+                            # language = self.get_language_code_from_gv_poly(word)
+                            # instead we use our custom detection system
+                            bbox.language = self.get_main_language_code(cur_word)
+                            bboxes.append(bbox)
+        avg_width = statistics.mean(widths) if widths else None
-        logging.debug("average char width: %f", avg_width)
+        # avg_width is None when no symbol width could be measured, and
+        # "%f" cannot format None
+        if avg_width is not None:
+            logging.debug("average char width: %f", avg_width)
         return bboxes, avg_width
 

diff --git a/openpecha/formatters/ocr/hocr.py b/openpecha/formatters/ocr/hocr.py
@@ -170,12 +170,6 @@ def __init__(self, mode=None, output_path=None, metadata=None):
         super().__init__(output_path, metadata)
         self.mode = mode
         self.word_span = 0
-
-    def get_confidence(self, word_box):
-        confidence_info = word_box['title']
-        confidence = float(int(re.search(r"x_wconf (\d+)", confidence_info).group(1))/100)
-        return confidence
-
 
     def get_word_text_with_space(self, line_text, word_box):
         """check for space after word_text using line_text, add space to word_text if found 
@@ -202,19 +196,39 @@ def parse_box(self, line_box, word_box):
             box : bbox for text in word_box with vertices, confidence, language 
         """
         line_text = line_box.text
-        try:
-            vertices_info = word_box['title'].split(';')[0]
-        except:
+        if not word_box.has_attr('title'):
+            return None
+        boxinfos = word_box['title'].split(';')
+        coords = None
+        angle = None
+        confidence = None
+        # each ';'-separated property of the title attribute is "<name> <values...>"
+        for boxinfo in boxinfos:
+            boxinfo_parts = boxinfo.strip().split(" ")
+            if boxinfo_parts[0] == "bbox":
+                # in HOCR's, bbox order is x0, y0, x1, y1
+                coords = [
+                    int(boxinfo_parts[1]),
+                    int(boxinfo_parts[2]),
+                    int(boxinfo_parts[3]),
+                    int(boxinfo_parts[4])
+                    ]
+            elif boxinfo_parts[0] == "textangle":
+                # angle is indicated counter-clockwise in hocr so
+                # we need to convert it to our internal (clockwise) value system;
+                # the modulo leaves 0 unchanged and keeps the result within 0..359
+                angle = (360 - int(boxinfo_parts[1])) % 360
+            elif boxinfo_parts[0] == "x_wconf":
+                confidence = float(boxinfo_parts[1]) / 100.0
+        if coords is None:
+            return None
+        if self.remove_rotated_boxes and angle is not None and angle > 0:
             return None
-        vertices_coordinates = vertices_info.split(" ")
-        x1 = int(vertices_coordinates[1])
-        y1 = int(vertices_coordinates[2])
-        x2 = int(vertices_coordinates[3])
-        y2 = int(vertices_coordinates[4])
-        confidence = self.get_confidence(word_box)
         language = self.get_main_language_code(word_box.text)
         text = self.get_word_text_with_space(line_text, word_box)
-        box = BBox(x1, x2, y1, y2,
+        # but we initialize as x1, x2, y1, y2
+        box = BBox(coords[0], coords[2], coords[1], coords[3],
+            angle = angle,
             text=text,
             confidence=confidence,
             language=language
@@ -237,7 +251,9 @@ def get_boxes(self, hocr_page_html):
             self.word_span = 0
             word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
             for word_box in word_boxes:
-                bboxes.append(self.parse_box(line_box,word_box))
+                bbox = self.parse_box(line_box,word_box)
+                if bbox is not None:
+                    bboxes.append(bbox)
         return bboxes
 
     def get_boxes_for_IA(self, page_html):
@@ -257,7 +273,9 @@ def get_boxes_for_IA(self, page_html):
                 self.word_span = 0
                 word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
                 for word_box in word_boxes:
-                    bboxes.append(self.parse_box(line_box, word_box))
+                    bbox = self.parse_box(line_box,word_box)
+                    if bbox is not None:
+                        bboxes.append(bbox)
         return bboxes
 
 

diff --git a/openpecha/formatters/ocr/ocr.py b/openpecha/formatters/ocr/ocr.py
@@ -40,12 +40,13 @@
 SAME_LINE_RATIO_THRESHOLD = 0.2
 
 class BBox:
-    def __init__(self, x1: int, x2: int, y1: int, y2: int, text: str = None, confidence: float = None, language: str = NO_LANG):
+    def __init__(self, x1: int, x2: int, y1: int, y2: int, angle: int = None, text: str = None, confidence: float = None, language: str = NO_LANG):
         self.text = text
         self.x1 = x1
         self.x2 = x2
         self.y1 = y1
         self.y2 = y2
+        self.angle = angle
         self.confidence = confidence
         self.language = language
         self.mid_y = (y1 + y2) / 2
@@ -57,6 +58,14 @@ def get_height(self):
     def get_width(self):
         return self.x2 - self.x1
 
+    def get_angle(self):
+        """
+        Returns the angle of the BBox. The value is either None (when no angle can be determined)
+        or the value of the clockwise rotation, in positive degrees (the value must be between 0 and 359).
+        A value of 0 represents straight characters.
+        """
+        return self.angle
+
     def get_box_orientation(self):
         width = self.x2 - self.x1
         length = self.y2 - self.y1
@@ -113,6 +122,7 @@ def __init__(self, output_path=None, metadata=None):
         self.create_language_layer = True
         self.ocr_confidence_threshold = ANNOTATION_MINIMAL_CONFIDENCE
         self.language_annotation_min_len = ANNOTATION_MINIMAL_LEN
+        self.remove_rotated_boxes = True
 
     def text_preprocess(self, text):
         return text
@@ -280,6 +290,7 @@ def has_space_after(self, cur_bbox, next_bbox, avg_char_width):
                 next_bbox.x1,
                 cur_bbox.y1, # the y coordinates are kind of arbitrary
                 cur_bbox.y2,
+                angle=0,
                 text=" ",
                 confidence=None,
                 language=None
@@ -440,7 +451,7 @@ def build_page(self, bboxes, image_number, image_filename, state, avg_char_width
         if mean_page_confidence < self.ocr_confidence_threshold or nb_below_threshold > self.max_low_conf_per_page:
             state["low_confidence_annotations"][self.get_unique_id()] = OCRConfidence(
                 span=Span(start=page_start_cc, end=state["base_layer_len"]), 
-                confidence=mean_page_confidence, nb_below_threshold=nb_below_threshold)
+                confidence=mean_page_confidence, nb_below_threshold=nb_below_threshold if nb_below_threshold else None)
         else:
             self.merge_page_low_confidence_annotations(state["page_low_confidence_annotations"], state["low_confidence_annotations"])
             state["page_low_confidence_annotations"] = []
@@ -609,6 +620,7 @@ def create_opf(self, data_provider, pecha_id=None, opf_options = {}, ocr_import_
         self.data_provider = data_provider
 
         self.remove_non_character_lines = opf_options["remove_non_character_lines"] if "remove_non_character_lines" in opf_options else True
+        self.remove_rotated_boxes = opf_options["remove_rotated_boxes"] if "remove_rotated_boxes" in opf_options else True
         self.create_language_layer = opf_options["create_language_layer"] if "create_language_layer" in opf_options else True
         self.ocr_confidence_threshold = opf_options["ocr_confidence_threshold"] if "ocr_confidence_threshold" in opf_options else ANNOTATION_MINIMAL_CONFIDENCE
         self.language_annotation_min_len = opf_options["language_annotation_min_len"] if "language_annotation_min_len" in opf_options else ANNOTATION_MINIMAL_LEN