From dcc4bad990af7c06ce140afd452ca1d404bfc759 Mon Sep 17 00:00:00 2001 From: eroux Date: Thu, 3 Nov 2022 13:53:48 +0100 Subject: [PATCH 1/7] WIP: handle rotation --- openpecha/formatters/ocr/google_vision.py | 87 ++++++++++++++++--- openpecha/formatters/ocr/hocr.py | 40 +++++---- openpecha/formatters/ocr/ocr.py | 14 ++- .../opf_expected_datas/expected_base_text.txt | 6 +- 4 files changed, 115 insertions(+), 32 deletions(-) diff --git a/openpecha/formatters/ocr/google_vision.py b/openpecha/formatters/ocr/google_vision.py index 7ab63838..60d817ea 100644 --- a/openpecha/formatters/ocr/google_vision.py +++ b/openpecha/formatters/ocr/google_vision.py @@ -81,6 +81,50 @@ def get_language_code_from_gv_poly(self, gv_poly): # English is a kind of default for our purpose return "en" + @staticmethod + def get_bboxinfo_from_vertices(vertices): + """ + Vertices do not always have dots in the same order. The current hypothesis + is that their order represents the rotation of characters detected by + the OCR. + + This is not documented on + + https://cloud.google.com/vision/docs/reference/rest/v1/projects.locations.products.referenceImages#Vertex + + though, so use the angle value with caution. + """ + if len(vertices) == 0: + return None + idx_smallest = -1 + smallest_x = -1 + smallest_y = -1 + largest_x = -1 + largest_y = -1 + for idx, v in enumerate(vertices): + if "x" not in v or "y" not in v: + continue + smallest_x = v["x"] if smallest_x == -1 else min(v["x"], smallest_x) + smallest_y = v["y"] if smallest_y == -1 else min(v["y"], smallest_y) + largest_x = max(v["x"], largest_x) + largest_y = max(v["y"], largest_y) + # here we have to account for cases where the 4 dots don't form a rectangle + # because coordinates are shifted by 1, see test_bbox_info for some example + if abs(v["x"] - smallest_x) < 3 and abs(v["y"] - smallest_y) < 3: + idx_smallest = idx + if smallest_x == -1 or smallest_y == -1 or largest_y == -1 or largest_x == -1: + return None + angle = None + if len(vertices) == 4 and idx_smallest != -1: + angle = 0 + if idx_smallest == 1: + angle = 270 + if idx_smallest == 2: + angle = 180 + if idx_smallest == 3: + angle = 90 + return [smallest_x, largest_x, smallest_y, largest_y, angle] + def dict_to_bbox(self, word): """Convert bounding bbox to BBox object @@ -99,13 +143,31 @@ def dict_to_bbox(self, word): if 'boundingBox' not in word or 'vertices' not in word['boundingBox']: return None vertices = word['boundingBox']['vertices'] - if len(vertices) != 4 or 'x' not in vertices[0] or 'x' not in vertices[1] or 'y' not in vertices[0] or 'y' not in vertices[2]: + bboxinfo = GoogleVisionFormatter.get_bboxinfo_from_vertices(vertices) + if bboxinfo == None: return None - return BBox(vertices[0]['x'], vertices[1]['x'], vertices[0]['y'], vertices[2]['y'], + if self.remove_rotated_boxes and bboxinfo[4] > 0: + return None + return BBox(bboxinfo[0], bboxinfo[1], bboxinfo[2], bboxinfo[3], bboxinfo[4], text=text, confidence=confidence, language=language) + @staticmethod + def get_width_of_vertices(vertices): + if len(vertices) < 4: + return None + smallest_x = -1 + largest_x = -1 + for v in vertices: + if "x" not in v or "y" not in v: + continue + smallest_x = v["x"] if smallest_x == -1 else min(v["x"], smallest_x) + largest_x = max(v["x"], largest_x) + if smallest_x == -1: + return None + return largest_x - smallest_x + def get_char_base_bboxes_and_avg_width(self, response): """Return bounding bboxs in page response @@ -116,29 +178,30 @@ def get_char_base_bboxes_and_avg_width(self, response): list: list 
of BBox object which saves required info of a bounding bbox
         """
         bboxes = []
-        cur_word = ""
         widths = []
         for page in response['fullTextAnnotation']['pages']:
             for block in page['blocks']:
                 for paragraph in block['paragraphs']:
                     for word in paragraph['words']:
+                        bbox = self.dict_to_bbox(word)
+                        if bbox is None:
+                            # case where we ignore the bbox for some reason
+                            # for instance rotated text
+                            continue
+                        cur_word = ""
                         for symbol in word['symbols']:
                             symbolunicat = unicodedata.category(symbol['text'][0])
                             if symbolunicat in UNICODE_CHARCAT_FOR_WIDTH:
                                 vertices = symbol['boundingBox']['vertices']
-                                if len(vertices) < 2 or 'x' not in vertices[0] or 'x' not in vertices[1]:
-                                    logging.debug("symbol box with no coodinates, ignore for average width")
-                                    continue
-                                logging.debug("consider '%s' (cat %s) for avg width", symbol['text'], symbolunicat)
-                                widths.append(vertices[1]['x'] - vertices[0]['x'])
+                                width = GoogleVisionFormatter.get_width_of_vertices(vertices)
+                                if width is not None and width > 0:
+                                    widths.append(width)
                             cur_word += symbol['text']
                             if self.has_space_attached(symbol):
                                 cur_word += " "
-                        word['text'] = cur_word
-                        cur_word = ""
-                        bbox = self.dict_to_bbox(word)
+                        bbox.text = cur_word
                         bboxes.append(bbox)
-        avg_width = statistics.mean(widths)
+        avg_width = statistics.mean(widths) if widths else None
         logging.debug("average char width: %f", avg_width)
         return bboxes, avg_width
diff --git a/openpecha/formatters/ocr/hocr.py b/openpecha/formatters/ocr/hocr.py
index 91ce1afe..b00cfd1a 100644
--- a/openpecha/formatters/ocr/hocr.py
+++ b/openpecha/formatters/ocr/hocr.py
@@ -170,12 +170,6 @@ def __init__(self, mode=None, output_path=None, metadata=None):
         super().__init__(output_path, metadata)
         self.mode = mode
         self.word_span = 0
-
-    def get_confidence(self, word_box):
-        confidence_info = word_box['title']
-        confidence = float(int(re.search(r"x_wconf (\d+)", confidence_info).group(1))/100)
-        return confidence
-
     def get_word_text_with_space(self, line_text, word_box):
         """check for space after word_text using line_text, add space to word_text if found
@@ -202,19 +196,33 @@ def parse_box(self, line_box, word_box):
             box : bbox for text in word_box with vertices, confidence, language
         """
         line_text = line_box.text
-        try:
-            vertices_info = word_box['title'].split(';')[0]
-        except:
+        if 'title' not in word_box:
+            return None
+        boxinfos = word_box['title'].split(';')
+        coords = None
+        angle = None
+        confidence = None
+        for boxinfo in boxinfos:
+            boxinfo_parts = boxinfo.split(" ")
+            if boxinfo_parts[0] == "bbox":
+                coords = [
+                    int(boxinfo_parts[1]),
+                    int(boxinfo_parts[2]),
+                    int(boxinfo_parts[3]),
+                    int(boxinfo_parts[4])
+                ]
+            if boxinfo_parts[0] == "textangle":
+                # angle is indicated counter-clockwise in hocr so
+                # we need to convert it to our internal value system:
+                angle = 360 - int(boxinfo_parts[1])
+            if boxinfo_parts[0] == "x_wconf":
+                confidence = float(boxinfo_parts[1])
+        if coords is None:
             return None
-        vertices_coordinates = vertices_info.split(" ")
-        x1 = int(vertices_coordinates[1])
-        y1 = int(vertices_coordinates[2])
-        x2 = int(vertices_coordinates[3])
-        y2 = int(vertices_coordinates[4])
-        confidence = self.get_confidence(word_box)
         language = self.get_main_language_code(word_box.text)
         text = self.get_word_text_with_space(line_text, word_box)
-        box = BBox(x1, x2, y1, y2,
+        box = BBox(coords[0], coords[1], coords[2], coords[3],
+                   angle = angle,
                    text=text,
                    confidence=confidence,
                    language=language
diff --git a/openpecha/formatters/ocr/ocr.py b/openpecha/formatters/ocr/ocr.py
index 168f2f9e..efeceec9 100644
--- 
a/openpecha/formatters/ocr/ocr.py +++ b/openpecha/formatters/ocr/ocr.py @@ -40,12 +40,13 @@ SAME_LINE_RATIO_THRESHOLD = 0.2 class BBox: - def __init__(self, x1: int, x2: int, y1: int, y2: int, text: str = None, confidence: float = None, language: str = NO_LANG): + def __init__(self, x1: int, x2: int, y1: int, y2: int, angle: int = None, text: str = None, confidence: float = None, language: str = NO_LANG): self.text = text self.x1 = x1 self.x2 = x2 self.y1 = y1 self.y2 = y2 + self.angle = angle self.confidence = confidence self.language = language self.mid_y = (y1 + y2) / 2 @@ -57,6 +58,14 @@ def get_height(self): def get_width(self): return self.x2 - self.x1 + def get_angle(self): + """ + Returns the angle of the BBox. The value is either None (when no angle can be determined) + or the value of the clockwise rotation, in positive degrees (the value must be between 0 and 359). + A value of 0 represents straight characters. + """ + return self.angle + def get_box_orientation(self): width = self.x2 - self.x1 length = self.y2 - self.y1 @@ -113,6 +122,7 @@ def __init__(self, output_path=None, metadata=None): self.create_language_layer = True self.ocr_confidence_threshold = ANNOTATION_MINIMAL_CONFIDENCE self.language_annotation_min_len = ANNOTATION_MINIMAL_LEN + self.remove_rotated_boxes = True def text_preprocess(self, text): return text @@ -280,6 +290,7 @@ def has_space_after(self, cur_bbox, next_bbox, avg_char_width): next_bbox.x1, cur_bbox.y1, # the y coordinates are kind of arbitrary cur_bbox.y2, + angle=0, text=" ", confidence=None, language=None @@ -609,6 +620,7 @@ def create_opf(self, data_provider, pecha_id=None, opf_options = {}, ocr_import_ self.data_provider = data_provider self.remove_non_character_lines = opf_options["remove_non_character_lines"] if "remove_non_character_lines" in opf_options else True + self.remove_rotated_boxes = opf_options["remove_rotated_boxes"] if "remove_rotated_boxes" in opf_options else True self.create_language_layer = opf_options["create_language_layer"] if "create_language_layer" in opf_options else True self.ocr_confidence_threshold = opf_options["ocr_confidence_threshold"] if "ocr_confidence_threshold" in opf_options else ANNOTATION_MINIMAL_CONFIDENCE self.language_annotation_min_len = opf_options["language_annotation_min_len"] if "language_annotation_min_len" in opf_options else ANNOTATION_MINIMAL_LEN diff --git a/tests/formatters/google_vision/data/opf_expected_datas/expected_base_text.txt b/tests/formatters/google_vision/data/opf_expected_datas/expected_base_text.txt index 96f86610..a256da7e 100644 --- a/tests/formatters/google_vision/data/opf_expected_datas/expected_base_text.txt +++ b/tests/formatters/google_vision/data/opf_expected_datas/expected_base_text.txt @@ -39,7 +39,7 @@ Personal Name: :. ! kiin . I . -.. -. . . 11; : i Main Title: Lam 'bras g'zun Rdo rje'i tshig rkan gi mam 'grel bcu gcig : commentaries of Sa-chen Kun-dga'-snin -po (1092-1158) on the root text of the Lam 'bras system of teachings written at the hehest of various disciples. -Uniform Title: || : THAN 404 R I VE +Uniform Title: || : THAN 404 VE Published /Created: Dehradun. U .P. : Sakya Centre, 1985. Related Titles: Lam 'bras rnam 'grel bcu gcig. Description : 3 v. ; 9 x 45 cm . @@ -47,8 +47,8 @@ Notes: In Tibetan; pref. in English. Title on boards: Lam 'bras mam 'grel bcu geig. "Reproduced from a set of prints from the Sde-dge Dgon-chen blocks." Subjects: \ || \ . Rubricilig kun. -11111- Tit S hy -pl -Series: Nilai 10 horas likum s ; S. 
LS +11111- Tit hy -pl +Series: Nilai 10 horas likum ; S. LS LC Classification : BQ7672.4 .823 19856 Overseas Acg. No.: I Tib 2906 CALL NUMBER : 107671. S.: 1983 Lil: From ace6d02cf1fdaa8d4929665f84b5eb161e93a7a0 Mon Sep 17 00:00:00 2001 From: eroux Date: Thu, 3 Nov 2022 14:05:40 +0100 Subject: [PATCH 2/7] oops --- openpecha/formatters/ocr/hocr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openpecha/formatters/ocr/hocr.py b/openpecha/formatters/ocr/hocr.py index b00cfd1a..2656685c 100644 --- a/openpecha/formatters/ocr/hocr.py +++ b/openpecha/formatters/ocr/hocr.py @@ -205,6 +205,7 @@ def parse_box(self, line_box, word_box): for boxinfo in boxinfos: boxinfo_parts = boxinfo.split(" ") if boxinfo_parts[0] == "bbox": + # in HOCR's, bbox order is x0, y0, x1, y1 coords = [ int(boxinfo_parts[1]), int(boxinfo_parts[2]), @@ -221,7 +222,8 @@ def parse_box(self, line_box, word_box): return None language = self.get_main_language_code(word_box.text) text = self.get_word_text_with_space(line_text, word_box) - box = BBox(coords[0], coords[1], coords[2], coords[3], + # but we initialize as x1, x2, y1, y2 + box = BBox(coords[0], coords[2], coords[1], coords[3], angle = angle, text=text, confidence=confidence, From 7e3425dddb1e7e13a3cc48a2ab5d12b71686383f Mon Sep 17 00:00:00 2001 From: eroux Date: Thu, 3 Nov 2022 16:21:38 +0100 Subject: [PATCH 3/7] new test file --- openpecha/formatters/ocr/hocr.py | 2 +- .../google_vision/test_bbox_orientation.py | 131 ++++++++++++++++++ 2 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tests/formatters/google_vision/test_bbox_orientation.py diff --git a/openpecha/formatters/ocr/hocr.py b/openpecha/formatters/ocr/hocr.py index 2656685c..f135d5eb 100644 --- a/openpecha/formatters/ocr/hocr.py +++ b/openpecha/formatters/ocr/hocr.py @@ -217,7 +217,7 @@ def parse_box(self, line_box, word_box): # we need to convert it to our internal value system: angle = 360 - int(boxinfo_parts[1]) if boxinfo_parts[0] == "x_wconf": - confidence = float(boxinfo_parts[1]) + confidence = float(boxinfo_parts[1]) / 100.0 if coords is None: return None language = self.get_main_language_code(word_box.text) diff --git a/tests/formatters/google_vision/test_bbox_orientation.py b/tests/formatters/google_vision/test_bbox_orientation.py new file mode 100644 index 00000000..630437e9 --- /dev/null +++ b/tests/formatters/google_vision/test_bbox_orientation.py @@ -0,0 +1,131 @@ +from openpecha.formatters.ocr import GoogleVisionFormatter + +# 43 +# 12 + +test_0 = [ + { + "x": 816, + "y": 388 + }, + { + "x": 3635, + "y": 388 + }, + { + "x": 3635, + "y": 672 + }, + { + "x": 816, + "y": 672 + } +] + +expected_0 = [816,3635,388,672,0] + +# 32 +# 41 + +test_90 = [ + { + "x": 880, + "y": 388 + }, + { + "x": 880, + "y": 411 + }, + { + "x": 817, + "y": 411 + }, + { + "x": 817, + "y": 388 + } +] + +expected_90 = [817,880,388,411,90] + +# in this attested example, the polygon is not a rectangle +test_90_2 = [ + { + "x": 842, + "y": 304 + }, + { + "x": 843, + "y": 377 + }, + { + "x": 758, + "y": 378 + }, + { + "x": 757, + "y": 305 + } +] + +expected_90_2 = [757,843,304,378,90] + +# not attested yet, guess is + +# 21 +# 34 + +test_180 = [ + { + "x": 884, + "y": 672 + }, + { + "x": 832, + "y": 672 + }, + { + "x": 832, + "y": 611 + }, + { + "x": 884, + "y": 611 + } +] + +expected_180 = [832,884,611,672,180] + +# not attested yet, guess is + +# 14 +# 23 + +test_270 = [ + { + "x": 832, + "y": 672 + }, + { + "x": 832, + "y": 611 + }, + { + "x": 884, + "y": 611 + }, + { + "x": 884, + 
"y": 672 + } +] + +expected_270 = [832,884,611,672,270] + +def test_bbox_info(): + assert(GoogleVisionFormatter.get_bboxinfo_from_vertices(test_0) == expected_0) + assert(GoogleVisionFormatter.get_bboxinfo_from_vertices(test_90) == expected_90) + assert(GoogleVisionFormatter.get_bboxinfo_from_vertices(test_90_2) == expected_90_2) + assert(GoogleVisionFormatter.get_bboxinfo_from_vertices(test_180) == expected_180) + assert(GoogleVisionFormatter.get_bboxinfo_from_vertices(test_270) == expected_270) + From 7dc0ca018e682094c17adca9051e709fbe65b56a Mon Sep 17 00:00:00 2001 From: eroux Date: Thu, 3 Nov 2022 17:08:15 +0100 Subject: [PATCH 4/7] finish GV box orientation --- openpecha/core/pecha.py | 4 +- openpecha/formatters/ocr/google_vision.py | 18 +++---- .../opf_expected_datas/expected_Language.yml | 10 ++-- .../expected_OCRConfidence.yml | 54 +++++++++---------- .../expected_Pagination.yml | 20 +++---- .../expected_google_ocr_meta.yml | 10 ++-- tests/formatters/google_vision/test_gv.py | 2 +- 7 files changed, 58 insertions(+), 60 deletions(-) diff --git a/openpecha/core/pecha.py b/openpecha/core/pecha.py index 0caafb20..145bf407 100644 --- a/openpecha/core/pecha.py +++ b/openpecha/core/pecha.py @@ -322,7 +322,7 @@ def _read_components(self): return res def save_meta(self): - dump_yaml(json.loads(self.meta.json()), self.meta_fn) + dump_yaml(json.loads(self.meta.json(exclude_none=True)), self.meta_fn) def save_single_base(self, base_name: str, content: str = None): if not content: @@ -336,7 +336,7 @@ def save_base(self): def save_layer(self, base_name: str, layer_name: LayerEnum, layer: Layer): layer_fn = self._mkdir(self.layers_path / base_name) / f"{layer_name.value}.yml" - dump_yaml(json.loads(layer.json()), layer_fn) + dump_yaml(json.loads(layer.json(exclude_none=True)), layer_fn) return layer_fn def save_layers(self): diff --git a/openpecha/formatters/ocr/google_vision.py b/openpecha/formatters/ocr/google_vision.py index 60d817ea..2994cfe7 100644 --- a/openpecha/formatters/ocr/google_vision.py +++ b/openpecha/formatters/ocr/google_vision.py @@ -134,12 +134,7 @@ def dict_to_bbox(self, word): Returns: obj: BBox object of bounding bbox """ - text = word.get('text', '') confidence = word.get('confidence') - # the language returned by Google OCR is not particularly helpful - # language = self.get_language_code_from_gv_poly(word) - # instead we use our custom detection system - language = self.get_main_language_code(text) if 'boundingBox' not in word or 'vertices' not in word['boundingBox']: return None vertices = word['boundingBox']['vertices'] @@ -149,9 +144,7 @@ def dict_to_bbox(self, word): if self.remove_rotated_boxes and bboxinfo[4] > 0: return None return BBox(bboxinfo[0], bboxinfo[1], bboxinfo[2], bboxinfo[3], bboxinfo[4], - text=text, - confidence=confidence, - language=language) + confidence=confidence) @staticmethod def get_width_of_vertices(vertices): @@ -199,8 +192,13 @@ def get_char_base_bboxes_and_avg_width(self, response): cur_word += symbol['text'] if self.has_space_attached(symbol): cur_word += " " - bbox.text = cur_word - bboxes.append(bbox) + if cur_word: + bbox.text = cur_word + # the language returned by Google OCR is not particularly helpful + # language = self.get_language_code_from_gv_poly(word) + # instead we use our custom detection system + bbox.language = self.get_main_language_code(cur_word) + bboxes.append(bbox) avg_width = statistics.mean(widths) if widths else None logging.debug("average char width: %f", avg_width) return bboxes, avg_width diff --git 
a/tests/formatters/google_vision/data/opf_expected_datas/expected_Language.yml b/tests/formatters/google_vision/data/opf_expected_datas/expected_Language.yml index 9ac446cd..607ad46f 100644 --- a/tests/formatters/google_vision/data/opf_expected_datas/expected_Language.yml +++ b/tests/formatters/google_vision/data/opf_expected_datas/expected_Language.yml @@ -1,19 +1,19 @@ -id: 4be102e3a79641dc9c87e90fd67adcff +id: bfd404480d4b44569b6c39ab8238abd6 annotation_type: Language revision: '00001' annotations: - c03108b2b13f42eaa7180ccc1b51f09b: + 9d10e0235cb147c088f8d0ba6628e38b: span: start: 35 end: 160 language: en - 43e18af2e6bd40e0b9bab8fd323c2b7b: + 5962ca73b31f4b28ba162776d50eb156: span: start: 215 end: 282 language: en - 6f041b7ec7844e2782adb50017ff005b: + 465b2e651ef9488ab84ed62970ba4fc5: span: start: 306 - end: 2500 + end: 2492 language: en diff --git a/tests/formatters/google_vision/data/opf_expected_datas/expected_OCRConfidence.yml b/tests/formatters/google_vision/data/opf_expected_datas/expected_OCRConfidence.yml index 3d192606..225bfac6 100644 --- a/tests/formatters/google_vision/data/opf_expected_datas/expected_OCRConfidence.yml +++ b/tests/formatters/google_vision/data/opf_expected_datas/expected_OCRConfidence.yml @@ -1,108 +1,108 @@ -id: c390f321e2514b9799d5ae82cc7959ec +id: 21ede969308548b1bec6e64688d61457 annotation_type: OCRConfidence revision: '00001' annotations: - 5f0eb6072b314fbc9db920147fcb1800: + d76b3eba42674936a6f06c0d8e817fdf: span: start: 0 end: 34 confidence: 0.49705884079722795 - 4a9887c90cc243d9bcdaffdeeedd4ce9: + 7ff451cc69d64f07bc189197afd510d4: span: start: 52 end: 61 confidence: 0.8999999761581421 - 2a6c0ed4b6c74d9a831baa77d84c1039: + 638c21b709f94765b9099df52178d5eb: span: start: 78 end: 84 confidence: 0.6299999952316284 - a5b2d6c54d0d43fa8ecefa57748a68e2: + bd180242b48247dca7efb3f38815028c: span: start: 219 end: 220 confidence: 0.800000011920929 - 0324d79930124774979a723447316174: + b39f7e0258b04b2186fa55eea591fc19: span: start: 261 end: 262 confidence: 0.8299999833106995 - 8988e9a93c2d401a9061d0c8d74ab6ef: + ef7246d98e3d4ecbba0409089fe37caa: span: start: 267 end: 275 confidence: 0.625 - 6ba8571517544014a58c350575b7bd29: + bb9d5f29120844c696129c9a8759dc0a: span: start: 347 end: 352 confidence: 0.8799999952316284 - e9717869cce34520ad94f6cc14eb247b: + 5133962f00ae4e20b1e1c00f6f6b5ffb: span: start: 381 end: 390 confidence: 0.7200000087420145 - 7b776ea79d1f47c0a140a40e42f8926e: + 05355888c5a4462fbb9e98e3a5b512c7: span: start: 436 end: 441 confidence: 0.8600000143051147 - dcfc299bfdf942719882aa8a38cac3ee: + bd1dea4c0cb4456dbd5202c777abb869: span: start: 459 end: 462 confidence: 0.7966666420300802 - 2c87abd45dab44b28f9068b24e6bb9fe: + 6b2449171f8f43d4873ecbde43307778: span: start: 486 end: 489 confidence: 0.6433333158493042 - 615cd43341fa439a8381fdda5694d00f: + 78eaf3d98e354c3a90fbeee854f204c4: span: start: 506 end: 517 confidence: 0.8299999952316284 - 0af0be735cd348c5a93248cc706d6e68: + f789c0267ef44f8185bc6a35245cd641: span: start: 556 end: 562 confidence: 0.8550000190734863 - a9aac3d1df9e4d83932cfbc55f8cb9f2: + dd0fc522920b44cd8672fc51c74b0780: span: start: 604 end: 607 confidence: 0.8600000143051147 - 81444fa6599444bc887357791dbb41e9: + b687647be42f4d4db1ea4bc4f7275772: span: start: 641 end: 650 confidence: 0.7874999940395355 - 1be5fcff9f9f4053a557dcfd76365923: + 78fc5675d791425191c2435f48f4a1e3: span: start: 740 end: 751 confidence: 0.7029999852180481 - b1830e5821f6440384f53fce2bd2c35e: + 7daa67426abf44c9ba99056a6c2716f2: span: start: 764 end: 768 confidence: 
0.7449999898672104 - 7dcbe784ae8f44fb89e216044f051a4a: + 98148ad973c943a3b6613944f0a747db: span: start: 780 - end: 2432 - confidence: 0.8303962739266993 + end: 2424 + confidence: 0.8348705917947432 nb_below_threshold: 60 - 4419aad7890a445f8c4ac2a0bc84d1cc: + 6959ba49201146d48d1a5a300721ce9b: span: - start: 2433 - end: 2501 + start: 2425 + end: 2493 confidence: 0.9507142943995339 nb_below_threshold: 61 - af5fdcc14cea4ad5ac991fcf49ea76c4: + 096c1f31a0c34c5193ccdbb9a96d4ca5: span: - start: 2502 - end: 3009 + start: 2494 + end: 3001 confidence: 0.6949019623445529 nb_below_threshold: 71 confidence_threshold: 0.9 diff --git a/tests/formatters/google_vision/data/opf_expected_datas/expected_Pagination.yml b/tests/formatters/google_vision/data/opf_expected_datas/expected_Pagination.yml index a1f7c4fb..640fa616 100644 --- a/tests/formatters/google_vision/data/opf_expected_datas/expected_Pagination.yml +++ b/tests/formatters/google_vision/data/opf_expected_datas/expected_Pagination.yml @@ -1,28 +1,28 @@ -id: 5218e004e189479d8f82f108fa0bfc62 +id: 6d3b365c858645c0a55e1f0166893b58 annotation_type: Pagination revision: '00001' annotations: - aa380a0e9259423784e94f598a18e13c: + dccc7f16a97147d5ad948878ff7333a2: span: start: 0 end: 779 imgnum: 1 reference: 38520001.tif - 30a159679b5349debbbdd6efeffd02cf: + 52c856b5413f45a8ba1bc9c9886a0c4d: span: start: 780 - end: 2432 + end: 2424 imgnum: 2 reference: 38520002.tif - 9bae877704634876b7905c7c4011c130: + 08e494ec561d4417995af6508cf49869: span: - start: 2433 - end: 2501 + start: 2425 + end: 2493 imgnum: 3 reference: 38520003.tif - bab648873653494787026545a7499ae7: + b3b3d85135d744928232b265a7ce74b4: span: - start: 2502 - end: 3009 + start: 2494 + end: 3001 imgnum: 4 reference: 38520004.tif diff --git a/tests/formatters/google_vision/data/opf_expected_datas/expected_google_ocr_meta.yml b/tests/formatters/google_vision/data/opf_expected_datas/expected_google_ocr_meta.yml index b058043c..467bd7b9 100644 --- a/tests/formatters/google_vision/data/opf_expected_datas/expected_google_ocr_meta.yml +++ b/tests/formatters/google_vision/data/opf_expected_datas/expected_google_ocr_meta.yml @@ -30,7 +30,7 @@ source_metadata: languages: - bo statistics: - ocr_word_mean_confidence_index: 0.8739654291793503 + ocr_word_mean_confidence_index: 0.8764347877029491 ocr_word_median_confidence_index: 0.9599999785423279 quality: null bases: @@ -44,7 +44,7 @@ bases: base_file: I3852.txt statistics: ocr_word_median_confidence_index: 0.949999988079071 - ocr_word_mean_confidence_index: 0.8404232011670042 + ocr_word_mean_confidence_index: 0.8434858081116097 I3853: source_metadata: id: http://purl.bdrc.io/resource/I3853 @@ -55,7 +55,7 @@ bases: base_file: I3853.txt statistics: ocr_word_median_confidence_index: 0.9800000190734863 - ocr_word_mean_confidence_index: 0.915238913924214 + ocr_word_mean_confidence_index: 0.917140417030617 I3854: source_metadata: id: http://purl.bdrc.io/resource/I3854 @@ -65,8 +65,8 @@ bases: order: 3 base_file: I3854.txt statistics: - ocr_word_median_confidence_index: 0.9599999785423279 - ocr_word_mean_confidence_index: 0.8695215366935236 + ocr_word_median_confidence_index: 0.9700000286102295 + ocr_word_mean_confidence_index: 0.8718006486893658 copyright: status: Public domain notice: Public domain diff --git a/tests/formatters/google_vision/test_gv.py b/tests/formatters/google_vision/test_gv.py index f9ee66b3..b2438025 100644 --- a/tests/formatters/google_vision/test_gv.py +++ b/tests/formatters/google_vision/test_gv.py @@ -6,7 +6,7 @@ from openpecha.core.layer 
import LayerEnum, Layer from openpecha.core.pecha import OpenPechaFS from openpecha.formatters.ocr import GoogleVisionFormatter -from openpecha.utils import load_yaml +from openpecha.utils import load_yaml, dump_yaml from test_gv_data_provider import GoogleVisionTestFileProvider #logging.basicConfig(level=logging.DEBUG) From 11497659d60e4ac2600641599decf64ce173249c Mon Sep 17 00:00:00 2001 From: eroux Date: Fri, 4 Nov 2022 09:08:49 +0100 Subject: [PATCH 5/7] improvements for #211 --- openpecha/core/layer.py | 4 ++++ openpecha/core/metadata.py | 14 +++++++++++++- openpecha/core/pecha.py | 7 +++---- openpecha/formatters/layers.py | 7 ++++++- openpecha/formatters/ocr/hocr.py | 2 ++ openpecha/utils.py | 9 +++++++++ setup.py | 3 +-- 7 files changed, 38 insertions(+), 8 deletions(-) diff --git a/openpecha/core/layer.py b/openpecha/core/layer.py index a54dd129..9b48452b 100644 --- a/openpecha/core/layer.py +++ b/openpecha/core/layer.py @@ -37,6 +37,10 @@ class LayerEnum(Enum): segment = "Segment" ocr_confidence = "OCRConfidence" + @classmethod + def to_yaml(cls, representer, node): + return representer.represent_data(node.value) + def _get_annotation_class(layer_name: LayerEnum): """Maps LayerEnum to Annotation class""" diff --git a/openpecha/core/metadata.py b/openpecha/core/metadata.py index b85d7313..b9e72dc1 100644 --- a/openpecha/core/metadata.py +++ b/openpecha/core/metadata.py @@ -12,17 +12,25 @@ class InitialCreationType(Enum): ebook = "ebook" input = "input" + @classmethod + def to_yaml(cls, representer, node): + return representer.represent_data(node.value) + class CopyrightStatus(Enum): UNKNOWN = "Unknown" COPYRIGHTED = "In copyright" PUBLIC_DOMAIN = "Public domain" + @classmethod + def to_yaml(cls, representer, node): + return representer.represent_data(node.value) + class Copyright(BaseModel): status: CopyrightStatus = CopyrightStatus.UNKNOWN notice: Optional[str] = "" - info_url: Optional[AnyHttpUrl] = None + info_url: Optional[str] = None class Config: extra = Extra.forbid @@ -59,6 +67,10 @@ class LicenseType(Enum): UNDER_COPYRIGHT = "under copyright" + @classmethod + def to_yaml(cls, representer, node): + return representer.represent_data(node.value) + class PechaMetadata(BaseModel): id: str = None diff --git a/openpecha/core/pecha.py b/openpecha/core/pecha.py index 145bf407..9a926fcf 100644 --- a/openpecha/core/pecha.py +++ b/openpecha/core/pecha.py @@ -1,4 +1,3 @@ -import json import shutil import warnings from collections import defaultdict @@ -322,7 +321,7 @@ def _read_components(self): return res def save_meta(self): - dump_yaml(json.loads(self.meta.json(exclude_none=True)), self.meta_fn) + dump_yaml(self.meta.dict(exclude_none=True), self.meta_fn) def save_single_base(self, base_name: str, content: str = None): if not content: @@ -336,7 +335,7 @@ def save_base(self): def save_layer(self, base_name: str, layer_name: LayerEnum, layer: Layer): layer_fn = self._mkdir(self.layers_path / base_name) / f"{layer_name.value}.yml" - dump_yaml(json.loads(layer.json(exclude_none=True)), layer_fn) + dump_yaml(layer.dict(exclude_none=True), layer_fn) return layer_fn def save_layers(self): @@ -346,7 +345,7 @@ def save_layers(self): def save_index(self): try: - dump_yaml(json.loads(self.index.json()), self.index_fn) + dump_yaml(self.index.dict(exclude_none=True), self.index_fn) except FileNotFoundError: pass diff --git a/openpecha/formatters/layers.py b/openpecha/formatters/layers.py index 3d029e11..572923d6 100644 --- a/openpecha/formatters/layers.py +++ b/openpecha/formatters/layers.py @@ 
-3,6 +3,7 @@ """ from collections import namedtuple +from enum import Enum __all__ = [ "Layer", @@ -30,7 +31,7 @@ ] -class AnnType: +class AnnType(Enum): book_title = "BookTitle" sub_title = "SubTitle" book_number = "BookNumber" @@ -54,6 +55,10 @@ class AnnType: durchen = "Durchen" footnote = "Footnote" + @classmethod + def to_yaml(cls, representer, node): + return representer.represent_data(node.value) + class _attr_names: # Layer diff --git a/openpecha/formatters/ocr/hocr.py b/openpecha/formatters/ocr/hocr.py index f135d5eb..8a665332 100644 --- a/openpecha/formatters/ocr/hocr.py +++ b/openpecha/formatters/ocr/hocr.py @@ -220,6 +220,8 @@ def parse_box(self, line_box, word_box): confidence = float(boxinfo_parts[1]) / 100.0 if coords is None: return None + if self.remove_rotated_boxes and angle > 0: + return None language = self.get_main_language_code(word_box.text) text = self.get_word_text_with_space(line_text, word_box) # but we initialize as x1, x2, y1, y2 diff --git a/openpecha/utils.py b/openpecha/utils.py index e810efa1..407939ef 100644 --- a/openpecha/utils.py +++ b/openpecha/utils.py @@ -14,6 +14,9 @@ from git.cmd import GitCommandError from openpecha import config, storages +from openpecha.core.layer import LayerEnum +from openpecha.formatters.layers import AnnType +from openpecha.core import metadata from openpecha.exceptions import PechaNotFound from openpecha.github_utils import create_release from openpecha.storages import GithubStorage, setup_auth_for_old_repo @@ -31,6 +34,12 @@ except (ImportError, AttributeError): yaml_dumper = yaml.SafeDumper +yaml_dumper.add_multi_representer(LayerEnum, LayerEnum.to_yaml) +yaml_dumper.add_multi_representer(AnnType, AnnType.to_yaml) +yaml_dumper.add_multi_representer(metadata.InitialCreationType, metadata.InitialCreationType.to_yaml) +yaml_dumper.add_multi_representer(metadata.CopyrightStatus, metadata.CopyrightStatus.to_yaml) +yaml_dumper.add_multi_representer(metadata.LicenseType, metadata.LicenseType.to_yaml) + def gzip_str(string_): # taken from https://gist.github.com/Garrett-R/dc6f08fc1eab63f94d2cbb89cb61c33d out = io.BytesIO() diff --git a/setup.py b/setup.py index edfc0eae..dae21797 100644 --- a/setup.py +++ b/setup.py @@ -35,8 +35,7 @@ def get_version(prop, project): "Click>=7.1.2, <9.0", "diff-match-patch==20181111", "polib==1.1.1, <2.0", - "PyYAML>=5.0.0, <6.0", - "pylibyaml>=0.1.0, <2.0", + "PyYAML>=5.0.0", "requests>=2.22.0, <3.0", "antx>=0.1.10, <2.0", "tqdm>=4.35.0, <5.0", From 3c8e02cb03daa0d1f0d5d10d01adacbf8f44c7c9 Mon Sep 17 00:00:00 2001 From: eroux Date: Fri, 4 Nov 2022 09:19:06 +0100 Subject: [PATCH 6/7] simplify, fix NPE --- openpecha/core/layer.py | 4 ---- openpecha/core/metadata.py | 12 ------------ openpecha/formatters/layers.py | 5 ----- openpecha/formatters/ocr/hocr.py | 8 ++++++-- openpecha/utils.py | 13 ++++++++----- 5 files changed, 14 insertions(+), 28 deletions(-) diff --git a/openpecha/core/layer.py b/openpecha/core/layer.py index 9b48452b..a54dd129 100644 --- a/openpecha/core/layer.py +++ b/openpecha/core/layer.py @@ -37,10 +37,6 @@ class LayerEnum(Enum): segment = "Segment" ocr_confidence = "OCRConfidence" - @classmethod - def to_yaml(cls, representer, node): - return representer.represent_data(node.value) - def _get_annotation_class(layer_name: LayerEnum): """Maps LayerEnum to Annotation class""" diff --git a/openpecha/core/metadata.py b/openpecha/core/metadata.py index b9e72dc1..df1c0d26 100644 --- a/openpecha/core/metadata.py +++ b/openpecha/core/metadata.py @@ -12,20 +12,12 @@ class 
InitialCreationType(Enum): ebook = "ebook" input = "input" - @classmethod - def to_yaml(cls, representer, node): - return representer.represent_data(node.value) - class CopyrightStatus(Enum): UNKNOWN = "Unknown" COPYRIGHTED = "In copyright" PUBLIC_DOMAIN = "Public domain" - @classmethod - def to_yaml(cls, representer, node): - return representer.represent_data(node.value) - class Copyright(BaseModel): status: CopyrightStatus = CopyrightStatus.UNKNOWN @@ -67,10 +59,6 @@ class LicenseType(Enum): UNDER_COPYRIGHT = "under copyright" - @classmethod - def to_yaml(cls, representer, node): - return representer.represent_data(node.value) - class PechaMetadata(BaseModel): id: str = None diff --git a/openpecha/formatters/layers.py b/openpecha/formatters/layers.py index 572923d6..a189671d 100644 --- a/openpecha/formatters/layers.py +++ b/openpecha/formatters/layers.py @@ -55,11 +55,6 @@ class AnnType(Enum): durchen = "Durchen" footnote = "Footnote" - @classmethod - def to_yaml(cls, representer, node): - return representer.represent_data(node.value) - - class _attr_names: # Layer ID = "id" # Uique id for annotation of specific Pecha or Abstract work. type: str diff --git a/openpecha/formatters/ocr/hocr.py b/openpecha/formatters/ocr/hocr.py index 8a665332..ff5007f1 100644 --- a/openpecha/formatters/ocr/hocr.py +++ b/openpecha/formatters/ocr/hocr.py @@ -249,7 +249,9 @@ def get_boxes(self, hocr_page_html): self.word_span = 0 word_boxes = line_box.find_all("span", {"class": "ocrx_word"}) for word_box in word_boxes: - bboxes.append(self.parse_box(line_box,word_box)) + bbox = self.parse_box(line_box,word_box) + if bbox is not None: + bboxes.append(bbox) return bboxes def get_boxes_for_IA(self, page_html): @@ -269,7 +271,9 @@ def get_boxes_for_IA(self, page_html): self.word_span = 0 word_boxes = line_box.find_all("span", {"class": "ocrx_word"}) for word_box in word_boxes: - bboxes.append(self.parse_box(line_box, word_box)) + bbox = self.parse_box(line_box,word_box) + if bbox is not None: + bboxes.append(bbox) return bboxes diff --git a/openpecha/utils.py b/openpecha/utils.py index 407939ef..958b1efe 100644 --- a/openpecha/utils.py +++ b/openpecha/utils.py @@ -34,11 +34,14 @@ except (ImportError, AttributeError): yaml_dumper = yaml.SafeDumper -yaml_dumper.add_multi_representer(LayerEnum, LayerEnum.to_yaml) -yaml_dumper.add_multi_representer(AnnType, AnnType.to_yaml) -yaml_dumper.add_multi_representer(metadata.InitialCreationType, metadata.InitialCreationType.to_yaml) -yaml_dumper.add_multi_representer(metadata.CopyrightStatus, metadata.CopyrightStatus.to_yaml) -yaml_dumper.add_multi_representer(metadata.LicenseType, metadata.LicenseType.to_yaml) +def simple_enum_to_yaml(representer, node): + return representer.represent_data(node.value) + +yaml_dumper.add_multi_representer(LayerEnum, simple_enum_to_yaml) +yaml_dumper.add_multi_representer(AnnType, simple_enum_to_yaml) +yaml_dumper.add_multi_representer(metadata.InitialCreationType, simple_enum_to_yaml) +yaml_dumper.add_multi_representer(metadata.CopyrightStatus, simple_enum_to_yaml) +yaml_dumper.add_multi_representer(metadata.LicenseType, simple_enum_to_yaml) def gzip_str(string_): # taken from https://gist.github.com/Garrett-R/dc6f08fc1eab63f94d2cbb89cb61c33d From 182c6749464f2db15e5b2a1ec648f09816db2e53 Mon Sep 17 00:00:00 2001 From: eroux Date: Fri, 4 Nov 2022 11:12:47 +0100 Subject: [PATCH 7/7] fix(pecha): handle box rotation --- openpecha/formatters/ocr/hocr.py | 14 +++++++------ openpecha/formatters/ocr/ocr.py | 2 +- openpecha/serializers/epub.py | 27 
++++++++++
 openpecha/serializers/hfml.py    | 35 ++++++++++++++++----------------
 openpecha/serializers/pedurma.py |  5 +++--
 5 files changed, 44 insertions(+), 39 deletions(-)
diff --git a/openpecha/formatters/ocr/hocr.py b/openpecha/formatters/ocr/hocr.py
index ff5007f1..3307d1fd 100644
--- a/openpecha/formatters/ocr/hocr.py
+++ b/openpecha/formatters/ocr/hocr.py
@@ -196,14 +196,14 @@ def parse_box(self, line_box, word_box):
             box : bbox for text in word_box with vertices, confidence, language
         """
         line_text = line_box.text
-        if 'title' not in word_box:
+        if not word_box.has_attr('title'):
            return None
         boxinfos = word_box['title'].split(';')
         coords = None
         angle = None
         confidence = None
         for boxinfo in boxinfos:
-            boxinfo_parts = boxinfo.split(" ")
+            boxinfo_parts = boxinfo.strip().split(" ")
             if boxinfo_parts[0] == "bbox":
                 # in HOCR's, bbox order is x0, y0, x1, y1
                 coords = [
@@ -212,15 +212,17 @@ def parse_box(self, line_box, word_box):
                     int(boxinfo_parts[3]),
                     int(boxinfo_parts[4])
                 ]
-            if boxinfo_parts[0] == "textangle":
+            elif boxinfo_parts[0] == "textangle":
                 # angle is indicated counter-clockwise in hocr so
                 # we need to convert it to our internal value system:
-                angle = 360 - int(boxinfo_parts[1])
-            if boxinfo_parts[0] == "x_wconf":
+                angle = int(boxinfo_parts[1])
+                if angle != 0:
+                    angle = 360 - angle
+            elif boxinfo_parts[0] == "x_wconf":
                 confidence = float(boxinfo_parts[1]) / 100.0
         if coords is None:
             return None
-        if self.remove_rotated_boxes and angle > 0:
+        if self.remove_rotated_boxes and angle is not None and angle > 0:
             return None
         language = self.get_main_language_code(word_box.text)
         text = self.get_word_text_with_space(line_text, word_box)
         # but we initialize as x1, x2, y1, y2
diff --git a/openpecha/formatters/ocr/ocr.py b/openpecha/formatters/ocr/ocr.py
index efeceec9..f3a58440 100644
--- a/openpecha/formatters/ocr/ocr.py
+++ b/openpecha/formatters/ocr/ocr.py
@@ -451,7 +451,7 @@ def build_page(self, bboxes, image_number, image_filename, state, avg_char_width
         if mean_page_confidence < self.ocr_confidence_threshold or nb_below_threshold > self.max_low_conf_per_page:
             state["low_confidence_annotations"][self.get_unique_id()] = OCRConfidence(
                 span=Span(start=page_start_cc, end=state["base_layer_len"]),
-                confidence=mean_page_confidence, nb_below_threshold=nb_below_threshold)
+                confidence=mean_page_confidence, nb_below_threshold=nb_below_threshold if nb_below_threshold else None)
         else:
             self.merge_page_low_confidence_annotations(state["page_low_confidence_annotations"], state["low_confidence_annotations"])
             state["page_low_confidence_annotations"] = []
diff --git a/openpecha/serializers/epub.py b/openpecha/serializers/epub.py
index afa1c2c7..d0fe505e 100644
--- a/openpecha/serializers/epub.py
+++ b/openpecha/serializers/epub.py
@@ -135,45 +135,46 @@ def apply_annotation(self, base_id, ann, uuid2localid):
         only_start_ann = False
         start_payload = "("
         end_payload = ")"
-        if ann["type"] == AnnType.correction:
+        ann_type = AnnType(ann["type"])
+        if ann_type == AnnType.correction:
             start_payload = "("
             end_payload = f',{ann["correction"]})'
-        elif ann["type"] == AnnType.peydurma:
+        elif ann_type == AnnType.peydurma:
             start_payload = "#"
             only_start_ann = True
-        elif ann["type"] == AnnType.error_candidate:
+        elif ann_type == AnnType.error_candidate:
             start_payload = "["
             end_payload = "]"
-        elif ann["type"] == AnnType.book_title:
+        elif ann_type == AnnType.book_title:
             start_payload = Tsadra_template.book_title_SP
             end_payload = Tsadra_template.span_EP
-        elif ann["type"] == AnnType.sub_title:
+        elif ann_type == AnnType.sub_title:
start_payload = Tsadra_template.sub_title_SP end_payload = Tsadra_template.span_EP - elif ann["type"] == AnnType.book_number: + elif ann_type == AnnType.book_number: start_payload = Tsadra_template.book_number_SP end_payload = f"{Tsadra_template.span_EP}{Tsadra_template.para_EP}" - elif ann["type"] == AnnType.author: + elif ann_type == AnnType.author: start_payload = Tsadra_template.author_SP end_payload = f"{Tsadra_template.span_EP}{Tsadra_template.para_EP}" - elif ann["type"] == AnnType.chapter: + elif ann_type == AnnType.chapter: start_payload = Tsadra_template.chapter_SP end_payload = Tsadra_template.span_EP - elif ann["type"] == AnnType.tsawa: + elif ann_type == AnnType.tsawa: css_class_name = self.get_css_class_name(ann) start_payload = self.get_tsawa_sp(css_class_name) end_payload = Tsadra_template.span_EP - elif ann["type"] == AnnType.citation: + elif ann_type == AnnType.citation: css_class_name = self.get_css_class_name(ann) start_payload = self.get_citation_sp(css_class_name) end_payload = Tsadra_template.span_EP - elif ann["type"] == AnnType.sabche: + elif ann_type == AnnType.sabche: start_payload = Tsadra_template.sabche_SP end_payload = Tsadra_template.span_EP - elif ann["type"] == AnnType.yigchung: + elif ann_type == AnnType.yigchung: start_payload = Tsadra_template.yigchung_SP end_payload = Tsadra_template.span_EP - elif ann["type"] == AnnType.footnote: + elif ann_type == AnnType.footnote: start_payload = f'{Tsadra_template.footnote_marker_SP} id="fm{ann["id"]}">' end_payload = Tsadra_template.footnote_EP diff --git a/openpecha/serializers/hfml.py b/openpecha/serializers/hfml.py index 44844683..33588adb 100644 --- a/openpecha/serializers/hfml.py +++ b/openpecha/serializers/hfml.py @@ -22,7 +22,8 @@ def apply_annotation(self, base_id, ann, uuid2localid=None): end_payload = ")" side = "ab" local_id = self.get_local_id(ann, uuid2localid) - if ann["type"] == AnnType.pagination: + ann_type = AnnType(ann["type"]) + if ann_type == AnnType.pagination: pg_idx = ann.get("page_index", "") if not pg_idx: pg_idx = ann.get("imgnum", "") @@ -46,52 +47,52 @@ def apply_annotation(self, base_id, ann, uuid2localid=None): else: start_payload += "\n" only_start_ann = True - elif ann["type"] == AnnType.topic: + elif ann_type == AnnType.topic: start_payload = f"{{{ann['work_id']}}}" only_start_ann = True - elif ann["type"] == AnnType.sub_topic: + elif ann_type == AnnType.sub_topic: start_payload = f"{{{ann['work_id']}}}" only_start_ann = True - elif ann["type"] == AnnType.correction: + elif ann_type == AnnType.correction: start_payload = f"<{local_id}" end_payload = f',{ann["correction"]}>' - elif ann["type"] == AnnType.archaic: + elif ann_type == AnnType.archaic: start_payload = f"{{{local_id}" end_payload = f',{ann["modern"]}}}' - elif ann["type"] == AnnType.peydurma: + elif ann_type == AnnType.peydurma: start_payload = f"#{local_id}" only_start_ann = True - elif ann["type"] == AnnType.error_candidate: + elif ann_type == AnnType.error_candidate: start_payload = f"[{local_id}" end_payload = "]" - elif ann["type"] == AnnType.book_title: + elif ann_type == AnnType.book_title: start_payload = f"<{local_id}k1" end_payload = ">" - elif ann["type"] == AnnType.book_number: + elif ann_type == AnnType.book_number: start_payload = f"<{local_id}k4" end_payload = ">" - elif ann["type"] == AnnType.poti_title: + elif ann_type == AnnType.poti_title: start_payload = f"<{local_id}k2" end_payload = ">" - elif ann["type"] == AnnType.author: + elif ann_type == AnnType.author: start_payload = f"<{local_id}au" 
end_payload = ">" - elif ann["type"] == AnnType.chapter: + elif ann_type == AnnType.chapter: start_payload = f"<{local_id}k3" end_payload = ">" - elif ann["type"] == AnnType.tsawa: + elif ann_type == AnnType.tsawa: start_payload = f"<{local_id}m" end_payload = "m>" - elif ann["type"] == AnnType.citation: + elif ann_type == AnnType.citation: start_payload = f"<{local_id}g" end_payload = "g>" - elif ann["type"] == AnnType.sabche: + elif ann_type == AnnType.sabche: start_payload = f"<{local_id}q" end_payload = "q>" - elif ann["type"] == AnnType.yigchung: + elif ann_type == AnnType.yigchung: start_payload = f"<{local_id}y" end_payload = "y>" - elif ann["type"] == AnnType.durchen: + elif ann_type == AnnType.durchen: start_payload = f"<{local_id}d" end_payload = "d>" diff --git a/openpecha/serializers/pedurma.py b/openpecha/serializers/pedurma.py index 082bf01b..d0e8a248 100644 --- a/openpecha/serializers/pedurma.py +++ b/openpecha/serializers/pedurma.py @@ -52,7 +52,8 @@ def apply_annotation(self, vol_id, ann, uuid2localid): end_payload = ")" side = "ab" local_id = self.get_local_id(ann, uuid2localid) - if ann["type"] == AnnType.pagination: + ann_type = AnnType(ann["type"]) + if ann_type == AnnType.pagination: pg_idx = ann.get("page_index", "") if not pg_idx: pg_idx = ann.get("imgnum", "") @@ -76,7 +77,7 @@ def apply_annotation(self, vol_id, ann, uuid2localid): else: start_payload += "\n" only_start_ann = True - elif ann["type"] == AnnType.pedurma_note: + elif ann_type == AnnType.pedurma_note: start_payload = "" end_payload = f'{ann["collation_note"]}'
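
A minimal sketch of the vertex-order heuristic the first patch introduces, runnable on its own since get_bboxinfo_from_vertices is a staticmethod; it assumes the openpecha package from this series is importable and reuses the attested 90-degree polygon from test_bbox_orientation.py:

    from openpecha.formatters.ocr import GoogleVisionFormatter

    # In the attested 90-degree case the top-left point of the polygon is the
    # fourth vertex rather than the first, which the heuristic reads as rotation.
    vertices = [
        {"x": 880, "y": 388},
        {"x": 880, "y": 411},
        {"x": 817, "y": 411},
        {"x": 817, "y": 388},
    ]
    x1, x2, y1, y2, angle = GoogleVisionFormatter.get_bboxinfo_from_vertices(vertices)
    print(x1, x2, y1, y2, angle)  # 817 880 388 411 90

dict_to_bbox then drops any box with angle > 0 as long as remove_rotated_boxes stays at its default of True; passing opf_options={"remove_rotated_boxes": False} to create_opf keeps rotated boxes.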
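On the hOCR side, parse_box (in its final shape after the last patch) takes everything from the word span's title attribute. A standalone sketch of that parsing under the hOCR conventions the patches cite (bbox is x0 y0 x1 y1; textangle is counter-clockwise degrees); parse_hocr_title is an illustrative name, not an openpecha function:

    def parse_hocr_title(title):
        """Split an hOCR title attribute into coords, angle and confidence."""
        coords = None
        angle = None
        confidence = None
        for prop in title.split(";"):
            parts = prop.strip().split(" ")
            if parts[0] == "bbox":
                # hOCR bbox order is x0, y0, x1, y1
                coords = [int(p) for p in parts[1:5]]
            elif parts[0] == "textangle":
                # hOCR angles run counter-clockwise, BBox.angle runs clockwise
                angle = int(parts[1])
                if angle != 0:
                    angle = 360 - angle
            elif parts[0] == "x_wconf":
                confidence = float(parts[1]) / 100.0
        return coords, angle, confidence

    print(parse_hocr_title("bbox 397 322 454 352; textangle 90; x_wconf 96"))
    # ([397, 322, 454, 352], 270, 0.96)

The BBox is then built as BBox(coords[0], coords[2], coords[1], coords[3], angle=angle, ...), reordering into the x1, x2, y1, y2 the constructor expects.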
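The YAML changes in patches 5 and 6 converge on one generic representer registered per enum class, so that enum members serialize as their plain values instead of failing or producing python/object tags in the layer and meta files. A self-contained sketch of the mechanism, with DummyType standing in for LayerEnum, AnnType and the metadata enums:

    from enum import Enum

    import yaml

    class DummyType(Enum):
        pagination = "Pagination"
        ocr_confidence = "OCRConfidence"

    def simple_enum_to_yaml(representer, node):
        # serialize the enum member as its plain string value
        return representer.represent_data(node.value)

    yaml.SafeDumper.add_multi_representer(DummyType, simple_enum_to_yaml)

    print(yaml.dump({"annotation_type": DummyType.pagination}, Dumper=yaml.SafeDumper))
    # annotation_type: Pagination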