Skip to content

Commit

Permalink
Merge pull request #214 from OpenPecha/rotation
Browse files Browse the repository at this point in the history
fix(pecha): handle box rotation
  • Loading branch information
10zinten authored Nov 7, 2022
2 parents 08a902c + 182c674 commit 552c4ad
Show file tree
Hide file tree
Showing 18 changed files with 368 additions and 133 deletions.
2 changes: 1 addition & 1 deletion openpecha/core/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class CopyrightStatus(Enum):
class Copyright(BaseModel):
status: CopyrightStatus = CopyrightStatus.UNKNOWN
notice: Optional[str] = ""
info_url: Optional[AnyHttpUrl] = None
info_url: Optional[str] = None

class Config:
extra = Extra.forbid
Expand Down
7 changes: 3 additions & 4 deletions openpecha/core/pecha.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import shutil
import warnings
from collections import defaultdict
Expand Down Expand Up @@ -322,7 +321,7 @@ def _read_components(self):
return res

def save_meta(self):
dump_yaml(json.loads(self.meta.json()), self.meta_fn)
dump_yaml(self.meta.dict(exclude_none=True), self.meta_fn)

def save_single_base(self, base_name: str, content: str = None):
if not content:
Expand All @@ -336,7 +335,7 @@ def save_base(self):

def save_layer(self, base_name: str, layer_name: LayerEnum, layer: Layer):
layer_fn = self._mkdir(self.layers_path / base_name) / f"{layer_name.value}.yml"
dump_yaml(json.loads(layer.json()), layer_fn)
dump_yaml(layer.dict(exclude_none=True), layer_fn)
return layer_fn

def save_layers(self):
Expand All @@ -346,7 +345,7 @@ def save_layers(self):

def save_index(self):
try:
dump_yaml(json.loads(self.index.json()), self.index_fn)
dump_yaml(self.index.dict(exclude_none=True), self.index_fn)
except FileNotFoundError:
pass

Expand Down
4 changes: 2 additions & 2 deletions openpecha/formatters/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

from collections import namedtuple
from enum import Enum

__all__ = [
"Layer",
Expand Down Expand Up @@ -30,7 +31,7 @@
]


class AnnType:
class AnnType(Enum):
book_title = "BookTitle"
sub_title = "SubTitle"
book_number = "BookNumber"
Expand All @@ -54,7 +55,6 @@ class AnnType:
durchen = "Durchen"
footnote = "Footnote"


class _attr_names:
# Layer
ID = "id" # Unique id for annotation of specific Pecha or Abstract work. type: str
Expand Down
103 changes: 82 additions & 21 deletions openpecha/formatters/ocr/google_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,50 @@ def get_language_code_from_gv_poly(self, gv_poly):
# English is a kind of default for our purpose
return "en"

@staticmethod
def get_bboxinfo_from_vertices(vertices):
    """
    Compute the axis-aligned bounding box of a polygon plus a rotation guess.

    Vertices do not always have dots in the same order. The current hypothesis
    is that their order represents the rotation of characters detected by
    the OCR.
    This is not documented on
    https://cloud.google.com/vision/docs/reference/rest/v1/projects.locations.products.referenceImages#Vertex
    though, so use the angle value with caution.

    Args:
        vertices: list of dicts that may carry "x" and "y" keys. A vertex
            missing either key is ignored.

    Returns:
        None when no vertex has both coordinates, otherwise the list
        [smallest_x, largest_x, smallest_y, largest_y, angle]. angle is one of
        0, 90, 180, 270, or None when there are not exactly 4 vertices.
    """
    # None (not -1) marks "no value yet", so that a coordinate of -1 or any
    # other negative number is handled like every other coordinate.
    smallest_x = smallest_y = largest_x = largest_y = None
    idx_smallest = None
    for idx, v in enumerate(vertices):
        if "x" not in v or "y" not in v:
            continue
        x, y = v["x"], v["y"]
        if smallest_x is None:
            smallest_x = largest_x = x
            smallest_y = largest_y = y
        else:
            smallest_x = min(x, smallest_x)
            smallest_y = min(y, smallest_y)
            largest_x = max(x, largest_x)
            largest_y = max(y, largest_y)
        # here we have to account for cases where the 4 dots don't form a rectangle
        # because coordinates are shifted by 1, see test_bbox_info for some example
        if abs(x - smallest_x) < 3 and abs(y - smallest_y) < 3:
            idx_smallest = idx
    if smallest_x is None:
        # no vertex had both coordinates
        return None
    angle = None
    if len(vertices) == 4 and idx_smallest is not None:
        # position of the top-left corner in the vertex list -> clockwise rotation
        angle = {0: 0, 1: 270, 2: 180, 3: 90}[idx_smallest]
    return [smallest_x, largest_x, smallest_y, largest_y, angle]

def dict_to_bbox(self, word):
"""Convert bounding bbox to BBox object
Expand All @@ -90,21 +134,32 @@ def dict_to_bbox(self, word):
Returns:
obj: BBox object of bounding bbox
"""
text = word.get('text', '')
confidence = word.get('confidence')
# the language returned by Google OCR is not particularly helpful
# language = self.get_language_code_from_gv_poly(word)
# instead we use our custom detection system
language = self.get_main_language_code(text)
if 'boundingBox' not in word or 'vertices' not in word['boundingBox']:
return None
vertices = word['boundingBox']['vertices']
if len(vertices) != 4 or 'x' not in vertices[0] or 'x' not in vertices[1] or 'y' not in vertices[0] or 'y' not in vertices[2]:
bboxinfo = GoogleVisionFormatter.get_bboxinfo_from_vertices(vertices)
if bboxinfo == None:
return None
return BBox(vertices[0]['x'], vertices[1]['x'], vertices[0]['y'], vertices[2]['y'],
text=text,
confidence=confidence,
language=language)
if self.remove_rotated_boxes and bboxinfo[4] > 0:
return None
return BBox(bboxinfo[0], bboxinfo[1], bboxinfo[2], bboxinfo[3], bboxinfo[4],
confidence=confidence)

@staticmethod
def get_width_of_vertices(vertices):
    """
    Return the horizontal extent (largest x minus smallest x) of a polygon.

    Args:
        vertices: list of dicts that may carry "x" and "y" keys. A vertex
            missing either key is ignored.

    Returns:
        The width, or None when fewer than 4 vertices are given or when no
        vertex has both coordinates.
    """
    if len(vertices) < 4:
        return None
    # Collect the usable x values first instead of tracking min/max with a -1
    # sentinel, which would be confused by a real coordinate of -1 or below.
    xs = [v["x"] for v in vertices if "x" in v and "y" in v]
    if not xs:
        return None
    return max(xs) - min(xs)

def get_char_base_bboxes_and_avg_width(self, response):
"""Return bounding bboxs in page response
Expand All @@ -116,29 +171,35 @@ def get_char_base_bboxes_and_avg_width(self, response):
list: list of BBox object which saves required info of a bounding bbox
"""
bboxes = []
cur_word = ""
widths = []
for page in response['fullTextAnnotation']['pages']:
for block in page['blocks']:
for paragraph in block['paragraphs']:
for word in paragraph['words']:
bbox = self.dict_to_bbox(word)
if bbox is None:
# case where we ignore the bbox for some reason
# for instance rotated text
continue
cur_word = ""
for symbol in word['symbols']:
symbolunicat = unicodedata.category(symbol['text'][0])
if symbolunicat in UNICODE_CHARCAT_FOR_WIDTH:
vertices = symbol['boundingBox']['vertices']
if len(vertices) < 2 or 'x' not in vertices[0] or 'x' not in vertices[1]:
logging.debug("symbol box with no coodinates, ignore for average width")
continue
logging.debug("consider '%s' (cat %s) for avg width", symbol['text'], symbolunicat)
widths.append(vertices[1]['x'] - vertices[0]['x'])
width = GoogleVisionFormatter.get_width_of_vertices(vertices)
# get_width_of_vertices returns None when the symbol box has fewer than 4
# vertices or no usable coordinates; skip those instead of comparing None
# with 0, which raises TypeError.
if width is not None and width > 0:
    widths.append(width)
cur_word += symbol['text']
if self.has_space_attached(symbol):
cur_word += " "
word['text'] = cur_word
cur_word = ""
bbox = self.dict_to_bbox(word)
bboxes.append(bbox)
avg_width = statistics.mean(widths)
if cur_word:
bbox.text = cur_word
# the language returned by Google OCR is not particularly helpful
# language = self.get_language_code_from_gv_poly(word)
# instead we use our custom detection system
bbox.language = self.get_main_language_code(cur_word)
bboxes.append(bbox)
avg_width = statistics.mean(widths) if widths else None
logging.debug("average char width: %f", avg_width)
return bboxes, avg_width

Expand Down
54 changes: 36 additions & 18 deletions openpecha/formatters/ocr/hocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,6 @@ def __init__(self, mode=None, output_path=None, metadata=None):
super().__init__(output_path, metadata)
self.mode = mode
self.word_span = 0

def get_confidence(self, word_box):
confidence_info = word_box['title']
confidence = float(int(re.search(r"x_wconf (\d+)", confidence_info).group(1))/100)
return confidence


def get_word_text_with_space(self, line_text, word_box):
"""check for space after word_text using line_text, add space to word_text if found
Expand All @@ -202,19 +196,39 @@ def parse_box(self, line_box, word_box):
box : bbox for text in word_box with vertices, confidence, language
"""
line_text = line_box.text
try:
vertices_info = word_box['title'].split(';')[0]
except:
if not word_box.has_attr('title'):
return None
boxinfos = word_box['title'].split(';')
coords = None
angle = None
confidence = None
for boxinfo in boxinfos:
    # each ';'-separated property of the title is "name value [value ...]"
    boxinfo_parts = boxinfo.strip().split(" ")
    if boxinfo_parts[0] == "bbox":
        # in HOCR's, bbox order is x0, y0, x1, y1
        coords = [
            int(boxinfo_parts[1]),
            int(boxinfo_parts[2]),
            int(boxinfo_parts[3]),
            int(boxinfo_parts[4])
        ]
    elif boxinfo_parts[0] == "textangle":
        # angle is indicated counter-clockwise in hocr so
        # we need to convert it to our internal value system:
        angle = int(boxinfo_parts[1])
        # 0 stays 0; test the parsed value (the name `textangle` was never
        # defined and raised NameError here)
        if angle != 0:
            angle = 360 - angle
    elif boxinfo_parts[0] == "x_wconf":
        # x_wconf is divided by 100 to get the confidence value stored on the box
        confidence = float(boxinfo_parts[1]) / 100.0
if coords is None:
return None
if self.remove_rotated_boxes and angle is not None and angle > 0:
return None
vertices_coordinates = vertices_info.split(" ")
x1 = int(vertices_coordinates[1])
y1 = int(vertices_coordinates[2])
x2 = int(vertices_coordinates[3])
y2 = int(vertices_coordinates[4])
confidence = self.get_confidence(word_box)
language = self.get_main_language_code(word_box.text)
text = self.get_word_text_with_space(line_text, word_box)
box = BBox(x1, x2, y1, y2,
# but we initialize as x1, x2, y1, y2
box = BBox(coords[0], coords[2], coords[1], coords[3],
angle = angle,
text=text,
confidence=confidence,
language=language
Expand All @@ -237,7 +251,9 @@ def get_boxes(self, hocr_page_html):
self.word_span = 0
word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
for word_box in word_boxes:
bboxes.append(self.parse_box(line_box,word_box))
bbox = self.parse_box(line_box,word_box)
if bbox is not None:
bboxes.append(bbox)
return bboxes

def get_boxes_for_IA(self, page_html):
Expand All @@ -257,7 +273,9 @@ def get_boxes_for_IA(self, page_html):
self.word_span = 0
word_boxes = line_box.find_all("span", {"class": "ocrx_word"})
for word_box in word_boxes:
bboxes.append(self.parse_box(line_box, word_box))
bbox = self.parse_box(line_box,word_box)
if bbox is not None:
bboxes.append(bbox)
return bboxes


Expand Down
16 changes: 14 additions & 2 deletions openpecha/formatters/ocr/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,13 @@
SAME_LINE_RATIO_THRESHOLD = 0.2

class BBox:
def __init__(self, x1: int, x2: int, y1: int, y2: int, text: str = None, confidence: float = None, language: str = NO_LANG):
def __init__(self, x1: int, x2: int, y1: int, y2: int, angle: int = None, text: str = None, confidence: float = None, language: str = NO_LANG):
self.text = text
self.x1 = x1
self.x2 = x2
self.y1 = y1
self.y2 = y2
self.angle = angle
self.confidence = confidence
self.language = language
self.mid_y = (y1 + y2) / 2
Expand All @@ -57,6 +58,14 @@ def get_height(self):
def get_width(self):
return self.x2 - self.x1

def get_angle(self):
    """
    Return the rotation stored on this BBox.

    None means that no angle could be determined. Any other value is the
    clockwise rotation in positive degrees (it must be between 0 and 359),
    with 0 standing for straight characters.
    """
    rotation = self.angle
    return rotation

def get_box_orientation(self):
width = self.x2 - self.x1
length = self.y2 - self.y1
Expand Down Expand Up @@ -113,6 +122,7 @@ def __init__(self, output_path=None, metadata=None):
self.create_language_layer = True
self.ocr_confidence_threshold = ANNOTATION_MINIMAL_CONFIDENCE
self.language_annotation_min_len = ANNOTATION_MINIMAL_LEN
self.remove_rotated_boxes = True

def text_preprocess(self, text):
return text
Expand Down Expand Up @@ -280,6 +290,7 @@ def has_space_after(self, cur_bbox, next_bbox, avg_char_width):
next_bbox.x1,
cur_bbox.y1, # the y coordinates are kind of arbitrary
cur_bbox.y2,
angle=0,
text=" ",
confidence=None,
language=None
Expand Down Expand Up @@ -440,7 +451,7 @@ def build_page(self, bboxes, image_number, image_filename, state, avg_char_width
if mean_page_confidence < self.ocr_confidence_threshold or nb_below_threshold > self.max_low_conf_per_page:
state["low_confidence_annotations"][self.get_unique_id()] = OCRConfidence(
span=Span(start=page_start_cc, end=state["base_layer_len"]),
confidence=mean_page_confidence, nb_below_threshold=nb_below_threshold)
confidence=mean_page_confidence, nb_below_threshold=nb_below_threshold if nb_below_threshold else None)
else:
self.merge_page_low_confidence_annotations(state["page_low_confidence_annotations"], state["low_confidence_annotations"])
state["page_low_confidence_annotations"] = []
Expand Down Expand Up @@ -609,6 +620,7 @@ def create_opf(self, data_provider, pecha_id=None, opf_options = {}, ocr_import_
self.data_provider = data_provider

self.remove_non_character_lines = opf_options["remove_non_character_lines"] if "remove_non_character_lines" in opf_options else True
self.remove_rotated_boxes = opf_options["remove_rotated_boxes"] if "remove_rotated_boxes" in opf_options else True
self.create_language_layer = opf_options["create_language_layer"] if "create_language_layer" in opf_options else True
self.ocr_confidence_threshold = opf_options["ocr_confidence_threshold"] if "ocr_confidence_threshold" in opf_options else ANNOTATION_MINIMAL_CONFIDENCE
self.language_annotation_min_len = opf_options["language_annotation_min_len"] if "language_annotation_min_len" in opf_options else ANNOTATION_MINIMAL_LEN
Expand Down
Loading

0 comments on commit 552c4ad

Please sign in to comment.