-
Notifications
You must be signed in to change notification settings - Fork 463
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Add wildreceipt dataset #1359
Add wildreceipt dataset #1359
Changes from 30 commits
7ae18dc
f4c4895
a883ed0
ddb4d67
15abe0d
dcb63cb
87bf015
17c1112
f197337
3c7ce8d
b7d8cb7
a1f09b0
e3b9bdc
8c57b75
630437d
15804df
275afa5
82ed210
1e06371
a968db4
e42c71e
4ec3bf5
ff4b399
2a7d1e0
bffca24
954b8b0
edbcaf2
e257a29
2b3a578
c18175b
6c33799
fcedaba
478a420
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -0,0 +1,111 @@ | ||||
# Copyright (C) 2021-2023, Mindee. | ||||
|
||||
# This program is licensed under the Apache License 2.0. | ||||
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details. | ||||
|
||||
import glob | ||||
Check notice on line 6 in doctr/datasets/wildreceipt.py Codacy Production / Codacy Static Code Analysisdoctr/datasets/wildreceipt.py#L6
|
||||
import json | ||||
import os | ||||
from pathlib import Path | ||||
from typing import Any, Dict, List, Tuple, Union | ||||
|
||||
import numpy as np | ||||
from PIL import Image | ||||
Check notice on line 13 in doctr/datasets/wildreceipt.py Codacy Production / Codacy Static Code Analysisdoctr/datasets/wildreceipt.py#L13
|
||||
|
||||
from .datasets import AbstractDataset | ||||
from .utils import convert_target_to_relative, crop_bboxes_from_image | ||||
|
||||
__all__ = ["WILDRECEIPT"] | ||||
|
||||
from ..utils import polygon_to_bbox | ||||
Check notice on line 20 in doctr/datasets/wildreceipt.py Codacy Production / Codacy Static Code Analysisdoctr/datasets/wildreceipt.py#L20
|
||||
|
||||
|
||||
class WILDRECEIPT(AbstractDataset):
    """WildReceipt dataset from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction"
    <https://arxiv.org/abs/2103.14470v1>`_ |
    `repository <https://download.openmmlab.com/mmocr/data/wildreceipt.tar>`_.

    >>> # NOTE: You need to download the dataset first.
    >>> from doctr.datasets import WILDRECEIPT
    >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/",
    >>>                         label_path="/path/to/wildreceipt/train.txt")
    >>> img, target = train_set[0]
    >>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/",
    >>>                        label_path="/path/to/wildreceipt/test.txt")
    >>> img, target = test_set[0]

    Args:
        img_folder: folder with all the images of the dataset
        label_path: path to the annotations file of the dataset
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `AbstractDataset`.
    """

    def __init__(
        self,
        img_folder: str,
        label_path: str,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
        )
        # File existence check
        if not os.path.exists(label_path) or not os.path.exists(img_folder):
            raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

        tmp_root = img_folder
        self.train = train
        np_dtype = np.float32
        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []

        with open(label_path, 'r') as file:
            data = file.read()
        # The annotation file is JSON Lines: one JSON object per line, one line per image
        json_strings = data.strip().split('\n')
        box: Union[List[float], np.ndarray]
        for json_string in json_strings:
            json_data = json.loads(json_string)
            img_path = json_data['file_name']
            annotations = json_data['annotations']
            # Collect (text, box) pairs for the CURRENT image only.
            # Reset per image: a single accumulator shared across iterations would
            # attach every previous image's boxes to each subsequent image.
            _targets = []
            for annotation in annotations:
                coordinates = annotation['box']
                if use_polygons:
                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                    box = np.array(
                        [
                            [coordinates[0], coordinates[1]],
                            [coordinates[2], coordinates[3]],
                            [coordinates[4], coordinates[5]],
                            [coordinates[6], coordinates[7]],
                        ],
                        dtype=np_dtype
                    )
                else:
                    # Straight bbox from the 8-value polygon: min/max over the x and y coordinates
                    x, y = coordinates[::2], coordinates[1::2]
                    box = [min(x), min(y), max(x), max(y)]
                _targets.append((annotation['text'], box))
            if not _targets:
                # Skip annotation-less images: zip(*[]) would raise on unpacking
                continue
            text_targets, box_targets = zip(*_targets)

            if recognition_task:
                crops = crop_bboxes_from_image(
                    img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
                )
                for crop, label in zip(crops, list(text_targets)):
                    # Keep only usable recognition labels: non-empty and without spaces.
                    # NOTE: the previous check `not any(char in label for char in ["", " "])`
                    # was always False because `"" in label` holds for every string,
                    # so no crop was ever kept.
                    if label and " " not in label:
                        self.data.append((crop, label))
            else:
                self.data.append(
                    (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)))
                )
        self.root = tmp_root

    def extra_repr(self) -> str:
        return f"train={self.train}"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Optional
If we had an image giving a general overview of the dataset, that would be great
See:
https://mindee.github.io/doctr/modules/datasets.html
doctr/doctr/datasets/funsd.py
Line 24 in f22f6dd
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
where should I put the image ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@HamzaGbada you can post it here
@odulcy-mindee Could you upload it please ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@felixT2K @HamzaGbada Here you go: