From 7ae18dcbeb460c93e8db857b8d45ccf24e7dbe93 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:14:28 +0100 Subject: [PATCH 01/33] [ADD] wildreceipt init --- doctr/datasets/wildreceipt.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 doctr/datasets/wildreceipt.py diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py new file mode 100644 index 0000000000..0f69288e24 --- /dev/null +++ b/doctr/datasets/wildreceipt.py @@ -0,0 +1,38 @@ +# Copyright (C) 2021-2023, Mindee. + +# This program is licensed under the Apache License 2.0. +# See LICENSE or go to for full license details. + +import glob +import json +import os +from pathlib import Path +from typing import Any, Dict, List, Tuple, Union + +import cv2 +import numpy as np +from PIL import Image +from tqdm import tqdm + +from .datasets import AbstractDataset +from .utils import convert_target_to_relative, crop_bboxes_from_image + +__all__ = ["WILDRECEIPT"] + + +class WILDRECEIPT(AbstractDataset): + + + def __init__( + self, + img_folder: str, + label_path: str, + train: bool = True, + use_polygons: bool = False, + recognition_task: bool = False, + **kwargs: Any, + ) -> None: + super().__init__( + img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs + ) + pass From f4c489522d98658d271d687178546fa8552b86a5 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:15:46 +0100 Subject: [PATCH 02/33] [ADD] wildreceipt init --- doctr/datasets/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doctr/datasets/__init__.py b/doctr/datasets/__init__.py index a9f8c841f3..89ee15f2f4 100644 --- a/doctr/datasets/__init__.py +++ b/doctr/datasets/__init__.py @@ -19,6 +19,7 @@ from .synthtext import * from .utils import * from .vocabs import * +from .wildreceipt import * if is_tf_available(): from .loader import * From a883ed0e2fae1f9dcf44491e2f367623628507b3 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:39:37 +0100 Subject: [PATCH 03/33] [ADD] wildreceipt _convert_xmin_ymin --- doctr/datasets/wildreceipt.py | 73 ++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 0f69288e24..14d281f970 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -21,8 +21,28 @@ class WILDRECEIPT(AbstractDataset): + """WildReceipt is a collection of receipts. It contains, for each photo, of a list of OCRs - with bounding box, text, and class." + `_ | + >>> # NOTE: You need to download/generate the dataset from the repository. + >>> from doctr.datasets import WILDRECEIPT + >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/image_files", + >>> label_path="/path/to/wildreceipt/train.txt") + >>> img, target = train_set[0] + >>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/image_files", + >>> label_path="/path/to/wildreceipt/test.txt") + >>> img, target = test_set[0] + + Args: + img_folder: folder with all the images of the dataset + label_path: path to the annotations file of the dataset + train: whether the subset should be the training one + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + **kwargs: keyword arguments from `AbstractDataset`. 
+ """ + def __init__( self, img_folder: str, @@ -35,4 +55,55 @@ def __init__( super().__init__( img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs ) - pass + # File existence check + if not os.path.exists(label_path) or not os.path.exists(img_folder): + raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}") + + tmp_root = os.path.join(self.root, 'wildreceipt/') + self.train = train + + self.data: List[Tuple[str, Dict[str, Any]]] = [] + + self.filename = "train.txt" if self.train else "test.txt" + file_path = os.path.join(tmp_root, self.filename) + # logger.debug(f'the file names: {tmp_root}') + with open(file_path, 'r') as file: + data = file.read() + # Split the text file into separate JSON strings + json_strings = data.strip().split('\n') + for json_string in json_strings: + json_data = json.loads(json_string) + file_name = json_data['file_name'] + annotations = json_data['annotations'] + _targets = [(convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) + for annotation in annotations if get_area(convert_xmin_ymin(annotation['box'])) >= 50] + if _targets: + box_targets, text_units, labels = zip(*_targets) + + self.data.append(( + file_name, + dict(boxes=np.asarray(box_targets, dtype=int), labels=list(labels), + text_units=list(text_units)), + )) + self.root = tmp_root + + +def extra_repr(self) -> str: + return f"train={self.train}" + +def _read_from_folder(self, path: str) -> None: + for img_path in glob.glob(os.path.join(path, "*.png")): + with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: + self.data.append((img_path, f.read())) +def _convert_xmin_ymin(self, box) -> List: + if len(box) == 4: + return box + x1, y1, x2, y2, x3, y3, x4, y4 = box + x_min = min(x1, x2, x3, x4) + x_max = max(x1, x2, x3, x4) + y_min = min(y1, y2, y3, y4) + y_max = max(y1, y2, y3, y4) + return [x_min, y_min, x_max, y_max] + + + From ddb4d670322a733d2e5d115297428bc0a71caa76 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:41:44 +0100 Subject: [PATCH 04/33] [ADD] wildreceipt _convert_xmin_ymin --- doctr/datasets/wildreceipt.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 14d281f970..d7674b670f 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -44,13 +44,13 @@ class WILDRECEIPT(AbstractDataset): """ def __init__( - self, - img_folder: str, - label_path: str, - train: bool = True, - use_polygons: bool = False, - recognition_task: bool = False, - **kwargs: Any, + self, + img_folder: str, + label_path: str, + train: bool = True, + use_polygons: bool = False, + recognition_task: bool = False, + **kwargs: Any, ) -> None: super().__init__( img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs @@ -75,8 +75,8 @@ def __init__( json_data = json.loads(json_string) file_name = json_data['file_name'] annotations = json_data['annotations'] - _targets = [(convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) - for annotation in annotations if get_area(convert_xmin_ymin(annotation['box'])) >= 50] + _targets = [(_convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) + for annotation in annotations] if _targets: box_targets, text_units, labels = zip(*_targets) @@ -91,11 +91,15 @@ def __init__( 
def extra_repr(self) -> str: return f"train={self.train}" + def _read_from_folder(self, path: str) -> None: for img_path in glob.glob(os.path.join(path, "*.png")): with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: self.data.append((img_path, f.read())) -def _convert_xmin_ymin(self, box) -> List: + + +@classmethod +def _convert_xmin_ymin(box: List) -> List: if len(box) == 4: return box x1, y1, x2, y2, x3, y3, x4, y4 = box @@ -104,6 +108,3 @@ def _convert_xmin_ymin(self, box) -> List: y_min = min(y1, y2, y3, y4) y_max = max(y1, y2, y3, y4) return [x_min, y_min, x_max, y_max] - - - From 15abe0d86e6b5749fac1f52b2bf6e861b49e44b5 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:45:08 +0100 Subject: [PATCH 05/33] [ADD] wildreceipt _convert_xmin_ymin --- doctr/datasets/wildreceipt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index d7674b670f..47982019b4 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -66,7 +66,7 @@ def __init__( self.filename = "train.txt" if self.train else "test.txt" file_path = os.path.join(tmp_root, self.filename) - # logger.debug(f'the file names: {tmp_root}') + with open(file_path, 'r') as file: data = file.read() # Split the text file into separate JSON strings From dcb63cbb833c237211286e842a0e61ae307887ee Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Sat, 14 Oct 2023 16:06:19 +0100 Subject: [PATCH 06/33] [ADD] wildreceipt test --- doctr/datasets/wildreceipt.py | 2 +- tests/pytorch/test_datasets_pt.py | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 47982019b4..7db3003b40 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -23,7 +23,7 @@ class WILDRECEIPT(AbstractDataset): """WildReceipt is a collection of receipts. It contains, for each photo, of a list of OCRs - with bounding box, text, and class." `_ | - + `repository `_. >>> # NOTE: You need to download/generate the dataset from the repository. 
>>> from doctr.datasets import WILDRECEIPT diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 8f98e460fb..2235d0b3b0 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -551,6 +551,30 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): _validate_dataset(ds, input_size, is_polygons=rotate) +@pytest.mark.parametrize("rotate", [True, False]) +@pytest.mark.parametrize( + "input_size, num_samples, recognition", + [ + [[512, 512], 3, False], # Actual set has 7149 train and 796 test samples + [[32, 128], 5, True], # recognition + ], +) +def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt): + ds = datasets.WILDRECEIPT( + *mock_wildreceipt, + train=True, + img_transforms=Resize(input_size), + use_polygons=rotate, + recognition_task=recognition, + ) + + assert len(ds) == num_samples - 1 # -1 because of the test set 90 / 10 split + assert repr(ds) == f"WILDRECEIPT(train={True})" + if recognition: + _validate_dataset_recognition_part(ds, input_size) + else: + _validate_dataset(ds, input_size, is_polygons=rotate) + # NOTE: following datasets are only for recognition task @@ -575,4 +599,4 @@ def test_iiithws_dataset(mock_iiithws_dataset): assert len(ds) == 4 # Actual set has 7141797 train and 793533 test samples assert repr(ds) == f"IIITHWS(train={True})" - _validate_dataset_recognition_part(ds, input_size) + _validate_dataset_recognition_part(ds, input_size) \ No newline at end of file From 87bf015f485db876b43b4a053ff16920cb8f2814 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Sat, 14 Oct 2023 17:15:55 +0100 Subject: [PATCH 07/33] [ADD] wildreceipt test --- tests/pytorch/test_datasets_pt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 2235d0b3b0..cdd8811de5 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -555,8 +555,8 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): @pytest.mark.parametrize( "input_size, num_samples, recognition", [ - [[512, 512], 3, False], # Actual set has 7149 train and 796 test samples - [[32, 128], 5, True], # recognition + [[512, 512], 3, False], + [[32, 128], 5, True], ], ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt): @@ -567,7 +567,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ use_polygons=rotate, recognition_task=recognition, ) - + # TODO: FINISH THIS assert len(ds) == num_samples - 1 # -1 because of the test set 90 / 10 split assert repr(ds) == f"WILDRECEIPT(train={True})" if recognition: From 17c1112ab012813085ddeaea826b56624a5f5c45 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Sun, 15 Oct 2023 21:00:02 +0100 Subject: [PATCH 08/33] [UPDATE] wildreceipt use_polygon --- doctr/datasets/wildreceipt.py | 39 +++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 7db3003b40..5dcba2a5b5 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -61,7 +61,7 @@ def __init__( tmp_root = os.path.join(self.root, 'wildreceipt/') self.train = train - + np_dtype = np.float32 self.data: List[Tuple[str, Dict[str, Any]]] = [] self.filename = "train.txt" if self.train else "test.txt" @@ -71,20 +71,37 @@ def __init__( data = file.read() # Split the text 
file into separate JSON strings json_strings = data.strip().split('\n') + box: Union[List[float], np.ndarray] + _targets = [] for json_string in json_strings: json_data = json.loads(json_string) file_name = json_data['file_name'] annotations = json_data['annotations'] - _targets = [(_convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) - for annotation in annotations] - if _targets: - box_targets, text_units, labels = zip(*_targets) - - self.data.append(( - file_name, - dict(boxes=np.asarray(box_targets, dtype=int), labels=list(labels), - text_units=list(text_units)), - )) + for annotation in annotations: + coordinates = annotation['box'] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [coordinates[0], coordinates[1]], + [coordinates[2], coordinates[3]], + [coordinates[4], coordinates[5]], + [coordinates[6], coordinates[7]], + ], + dtype=np_dtype + ) + else: + box = _convert_xmin_ymin(coordinates) + _targets = [(_convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) + for annotation in annotations] + if _targets: + box_targets, text_units, labels = zip(*_targets) + + self.data.append(( + file_name, + dict(boxes=np.asarray(box_targets, dtype=int), labels=list(labels), + text_units=list(text_units)), + )) self.root = tmp_root From f197337f4a94cfb202bee87d20bd7b05f2b93eae Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Sun, 15 Oct 2023 22:32:12 +0100 Subject: [PATCH 09/33] [UPDATE] wildreceipt img_folder --- doctr/datasets/wildreceipt.py | 77 +++++++++++++++++------------------ 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 5dcba2a5b5..4def66aa6e 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -27,10 +27,10 @@ class WILDRECEIPT(AbstractDataset): >>> # NOTE: You need to download/generate the dataset from the repository. 
>>> from doctr.datasets import WILDRECEIPT - >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/image_files", + >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/", >>> label_path="/path/to/wildreceipt/train.txt") >>> img, target = train_set[0] - >>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/image_files", + >>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/", >>> label_path="/path/to/wildreceipt/test.txt") >>> img, target = test_set[0] @@ -59,15 +59,12 @@ def __init__( if not os.path.exists(label_path) or not os.path.exists(img_folder): raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}") - tmp_root = os.path.join(self.root, 'wildreceipt/') + tmp_root = img_folder self.train = train np_dtype = np.float32 self.data: List[Tuple[str, Dict[str, Any]]] = [] - self.filename = "train.txt" if self.train else "test.txt" - file_path = os.path.join(tmp_root, self.filename) - - with open(file_path, 'r') as file: + with open(label_path, 'r') as file: data = file.read() # Split the text file into separate JSON strings json_strings = data.strip().split('\n') @@ -75,7 +72,7 @@ def __init__( _targets = [] for json_string in json_strings: json_data = json.loads(json_string) - file_name = json_data['file_name'] + img_path = json_data['file_name'] annotations = json_data['annotations'] for annotation in annotations: coordinates = annotation['box'] @@ -91,37 +88,37 @@ def __init__( dtype=np_dtype ) else: - box = _convert_xmin_ymin(coordinates) - _targets = [(_convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) - for annotation in annotations] - if _targets: - box_targets, text_units, labels = zip(*_targets) - - self.data.append(( - file_name, - dict(boxes=np.asarray(box_targets, dtype=int), labels=list(labels), - text_units=list(text_units)), - )) + box = self._convert_xmin_ymin(coordinates) + _targets.append((annotation['text'], box)) + text_targets, box_targets = zip(*_targets) + + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + else: + self.data.append( + (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))) + ) self.root = tmp_root - -def extra_repr(self) -> str: - return f"train={self.train}" - - -def _read_from_folder(self, path: str) -> None: - for img_path in glob.glob(os.path.join(path, "*.png")): - with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: - self.data.append((img_path, f.read())) - - -@classmethod -def _convert_xmin_ymin(box: List) -> List: - if len(box) == 4: - return box - x1, y1, x2, y2, x3, y3, x4, y4 = box - x_min = min(x1, x2, x3, x4) - x_max = max(x1, x2, x3, x4) - y_min = min(y1, y2, y3, y4) - y_max = max(y1, y2, y3, y4) - return [x_min, y_min, x_max, y_max] + def extra_repr(self) -> str: + return f"train={self.train}" + + def _read_from_folder(self, path: str) -> None: + for img_path in glob.glob(os.path.join(path, "*.png")): + with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: + self.data.append((img_path, f.read())) + + @classmethod + def _convert_xmin_ymin(box: List) -> List: + if len(box) == 4: + return box + x1, y1, x2, y2, x3, y3, x4, y4 = box + x_min = min(x1, x2, x3, x4) + x_max = max(x1, x2, 
x3, x4) + y_min = min(y1, y2, y3, y4) + y_max = max(y1, y2, y3, y4) + return [x_min, y_min, x_max, y_max] From 3c7ce8d71a21fb48320bb1ffb3365d219931f07a Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 17:28:56 +0100 Subject: [PATCH 10/33] [ADD] mock_wildreceipt_dataset in conftest.py --- tests/conftest.py | 64 +++++++++++++++++++++++++++++++ tests/pytorch/test_datasets_pt.py | 2 +- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index c76e9393c7..d1091ffee9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -654,3 +654,67 @@ def mock_iiithws_dataset(tmpdir_factory, mock_image_stream): with open(fn, "wb") as f: f.write(file.getbuffer()) return str(root), str(label_file) + +@pytest.fixture(scope="session") +def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): + file = BytesIO(mock_image_stream) + root = tmpdir_factory.mktemp("datasets") + wildreceipt_root = root.mkdir("wildreceipt") + annotations_folder = wildreceipt_root + image_folder = wildreceipt_root.mkdir("image_files") + labels = { + "file_name": "image_files/Image_58/20/6aa1a16efcc7eb138bfee9ca90df704ef7498b54.jpeg", + "height": 1000, + "width": 562, + "annotations": [ + { + "box": [ + 200.0, + 16.0, + 317.0, + 16.0, + 317.0, + 0.0, + 200.0, + 0.0 + ], + "text": "", + "label": 0 + }, + { + "box": [ + 90.0, + 73.0, + 187.0, + 73.0, + 187.0, + 46.0, + 90.0, + 46.0 + ], + "text": "Server:Alex", + "label": 25 + }, + { + "box": [ + 92.0, + 95.0, + 222.0, + 95.0, + 222.0, + 67.0, + 92.0, + 67.0 + ], + "text": "Cashier:Sabrina", + "label": 25 + }] +} + + annotation_file = annotations_folder.join("train.txt") + with open(annotation_file, "w") as f: + json.dump(labels, f) + fn_i = image_folder.join(labels["file_name"]) + with open(fn_i, "wb") as f: + f.write(file.getbuffer()) + return str(image_folder), str(annotation_file) \ No newline at end of file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index cdd8811de5..6b1636de58 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -559,7 +559,7 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): [[32, 128], 5, True], ], ) -def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt): +def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): ds = datasets.WILDRECEIPT( *mock_wildreceipt, train=True, From b7d8cb7096013127215a9be7c2851546e75a2fe4 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 17:41:45 +0100 Subject: [PATCH 11/33] [BUG] mock_wildreceipt_dataset in conftest.py --- tests/conftest.py | 5 ++++- tests/pytorch/test_datasets_pt.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index d1091ffee9..852d6383c8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -662,6 +662,8 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): wildreceipt_root = root.mkdir("wildreceipt") annotations_folder = wildreceipt_root image_folder = wildreceipt_root.mkdir("image_files") + image_folder = image_folder.mkdir("Image_58") + image_folder = image_folder.mkdir("20") labels = { "file_name": "image_files/Image_58/20/6aa1a16efcc7eb138bfee9ca90df704ef7498b54.jpeg", "height": 1000, @@ -714,7 +716,8 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: 
json.dump(labels, f) - fn_i = image_folder.join(labels["file_name"]) + fn_i = root.join(labels["file_name"]) + # FIXME: this one does not create the file with open(fn_i, "wb") as f: f.write(file.getbuffer()) return str(image_folder), str(annotation_file) \ No newline at end of file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 6b1636de58..e9cd6418b5 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -561,7 +561,7 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): ds = datasets.WILDRECEIPT( - *mock_wildreceipt, + *mock_wildreceipt_dataset, train=True, img_transforms=Resize(input_size), use_polygons=rotate, From a1f09b054d17bda7ceae581266016b21a5ff477a Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 18:41:52 +0100 Subject: [PATCH 12/33] [BUG] mock_wildreceipt_dataset in conftest.py --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 852d6383c8..aad1fa3e6b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -719,5 +719,5 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): fn_i = root.join(labels["file_name"]) # FIXME: this one does not create the file with open(fn_i, "wb") as f: - f.write(file.getbuffer()) + f.write(file.read()) return str(image_folder), str(annotation_file) \ No newline at end of file From e3b9bdc09063370526dafd4af345b241d1fd0d6b Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 21:51:37 +0100 Subject: [PATCH 13/33] [BUG] mock_wildreceipt_dataset in conftest.py --- doctr/datasets/wildreceipt.py | 2 +- tests/conftest.py | 17 ++++++++++++----- tests/pytorch/test_datasets_pt.py | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 4def66aa6e..95d5b693a5 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -112,7 +112,7 @@ def _read_from_folder(self, path: str) -> None: with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: self.data.append((img_path, f.read())) - @classmethod + @staticmethod def _convert_xmin_ymin(box: List) -> List: if len(box) == 4: return box diff --git a/tests/conftest.py b/tests/conftest.py index aad1fa3e6b..efe9281f77 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import json +import os import shutil import tempfile from io import BytesIO @@ -662,10 +663,10 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): wildreceipt_root = root.mkdir("wildreceipt") annotations_folder = wildreceipt_root image_folder = wildreceipt_root.mkdir("image_files") - image_folder = image_folder.mkdir("Image_58") - image_folder = image_folder.mkdir("20") + # image_folder = image_folder.mkdir("Image_58") + # image_folder = image_folder.mkdir("20") labels = { - "file_name": "image_files/Image_58/20/6aa1a16efcc7eb138bfee9ca90df704ef7498b54.jpeg", + "file_name": "receipt_1.png", "height": 1000, "width": 562, "annotations": [ @@ -716,8 +717,14 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: json.dump(labels, f) - fn_i = root.join(labels["file_name"]) + # fn_i = root.join(labels["file_name"]) + # os.makedirs(os.path.dirname(fn_i), exist_ok=True) # 
FIXME: this one does not create the file + file = BytesIO(mock_image_stream) + fn_i = image_folder.join(f"receipt_1.png") with open(fn_i, "wb") as f: - f.write(file.read()) + f.write(file.getbuffer()) + fn_l = annotations_folder.join(f"receipt_1.json") + with open(fn_l, "w") as f: + json.dump(labels, f) return str(image_folder), str(annotation_file) \ No newline at end of file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index e9cd6418b5..c3209b4d24 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -555,8 +555,8 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): @pytest.mark.parametrize( "input_size, num_samples, recognition", [ - [[512, 512], 3, False], - [[32, 128], 5, True], + [[512, 512], 2, False], + [[32, 128], 2, True], ], ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): From 8c57b75f606f514547638b4d0cb3696d8b09030c Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 21:52:07 +0100 Subject: [PATCH 14/33] [BUG] mock_wildreceipt_dataset in conftest.py --- tests/pytorch/test_datasets_pt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index c3209b4d24..15e8e8f841 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -573,6 +573,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ if recognition: _validate_dataset_recognition_part(ds, input_size) else: + # FIXME: you have an error here _validate_dataset(ds, input_size, is_polygons=rotate) # NOTE: following datasets are only for recognition task From 630437d907cf11d2189dfe829af3d1cdbbd85298 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 19 Oct 2023 09:58:50 +0100 Subject: [PATCH 15/33] [BUG] mock_wildreceipt_dataset in conftest.py --- tests/pytorch/test_datasets_pt.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 15e8e8f841..c457fee582 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -14,6 +14,10 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_polygons=False): # Fetch one sample img, target = ds[0] + # TODO: ADD More then one element + print(f"img, label 0 {ds[0]}") + print(f"img, label 1 {ds[1]}") + assert isinstance(img, torch.Tensor) assert img.shape == (3, *input_size) assert img.dtype == torch.float32 @@ -40,6 +44,9 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_poly pin_memory=True, collate_fn=ds.collate_fn, ) + print(f"loder is {loader}") + print(f"loder is type {type(loader)}") + print(f"next(iter(loader)) {next(iter(loader))}") images, targets = next(iter(loader)) assert isinstance(images, torch.Tensor) and images.shape == (batch_size, 3, *input_size) @@ -49,6 +56,7 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_poly def _validate_dataset_recognition_part(ds, input_size, batch_size=2): # Fetch one sample img, label = ds[0] + assert isinstance(img, torch.Tensor) assert img.shape == (3, *input_size) assert img.dtype == torch.float32 From 15804df5bb1d2407adf981454561408f06a7f75c Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Tue, 24 Oct 2023 22:31:08 +0100 Subject: [PATCH 16/33] [BUG] mock_wildreceipt_dataset in conftest.py --- doctr/datasets/wildreceipt.py | 1 + tests/conftest.py | 131 
++++++++++++++++++++-------------- 2 files changed, 79 insertions(+), 53 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 95d5b693a5..661ae4a754 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -71,6 +71,7 @@ def __init__( box: Union[List[float], np.ndarray] _targets = [] for json_string in json_strings: + # FIXME there is a bug here check it with the unit test json_data = json.loads(json_string) img_path = json_data['file_name'] annotations = json_data['annotations'] diff --git a/tests/conftest.py b/tests/conftest.py index efe9281f77..271dc64a74 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -666,65 +666,90 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): # image_folder = image_folder.mkdir("Image_58") # image_folder = image_folder.mkdir("20") labels = { - "file_name": "receipt_1.png", - "height": 1000, - "width": 562, - "annotations": [ - { - "box": [ - 200.0, - 16.0, - 317.0, - 16.0, - 317.0, - 0.0, - 200.0, - 0.0 - ], - "text": "", - "label": 0 - }, - { - "box": [ - 90.0, - 73.0, - 187.0, - 73.0, - 187.0, - 46.0, - 90.0, - 46.0 - ], - "text": "Server:Alex", - "label": 25 - }, - { - "box": [ - 92.0, - 95.0, - 222.0, - 95.0, - 222.0, - 67.0, - 92.0, - 67.0 - ], - "text": "Cashier:Sabrina", - "label": 25 - }] -} + "file_name": "receipt_0.png", + "height": 348, + "width": 348, + "annotations": [ + + { + "box": [ + 263.0, + 283.0, + 325.0, + 283.0, + 325.0, + 260.0, + 263.0, + 260.0 + ], + "text": "$55.96", + "label": 17 + }, + { + "box": [ + 274.0, + 308.0, + 326.0, + 308.0, + 326.0, + 286.0, + 274.0, + 286.0 + ], + "text": "$4.48", + "label": 19 + } + ] + } + labels2 = { + "file_name": "receipt_1.png", + "height": 1720, + "width": 856, + "annotations": [ + { + "box": [ + 511.0, + 738.0, + 527.0, + 738.0, + 527.0, + 713.0, + 511.0, + 713.0 + ], + "text": "a", + "label": 25 + }, + { + "box": [ + 386.0, + 409.0, + 599.0, + 409.0, + 599.0, + 373.0, + 386.0, + 373.0 + ], + "text": "089-46169340", + "label": 5 + } + ] + } annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: json.dump(labels, f) + json.dump(labels2, f) # fn_i = root.join(labels["file_name"]) # os.makedirs(os.path.dirname(fn_i), exist_ok=True) # FIXME: this one does not create the file file = BytesIO(mock_image_stream) - fn_i = image_folder.join(f"receipt_1.png") - with open(fn_i, "wb") as f: - f.write(file.getbuffer()) - fn_l = annotations_folder.join(f"receipt_1.json") - with open(fn_l, "w") as f: - json.dump(labels, f) + for i in range(2): + fn_i = image_folder.join(f"receipt_{i}.png") + with open(fn_i, "wb") as f: + f.write(file.getbuffer()) + # fn_l = annotations_folder.join(f"receipt_{i}.json") + # with open(fn_l, "w") as f: + # json.dump(labels, f) return str(image_folder), str(annotation_file) \ No newline at end of file From 275afa52c48aa36b2563792798236bbc7ba5378f Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 21:55:14 +0100 Subject: [PATCH 17/33] [FIX] mock_wildreceipt_dataset labels --- tests/conftest.py | 68 ++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 271dc64a74..2d52f46bcd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -666,7 +666,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): # image_folder = image_folder.mkdir("Image_58") # image_folder = image_folder.mkdir("20") labels = { - "file_name": "receipt_0.png", + 
"file_name": "receipt_0.jpeg", "height": 348, "width": 348, "annotations": [ @@ -702,40 +702,36 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): ] } labels2 = { - "file_name": "receipt_1.png", - "height": 1720, - "width": 856, - "annotations": [ - { - "box": [ - 511.0, - 738.0, - 527.0, - 738.0, - 527.0, - 713.0, - 511.0, - 713.0 - ], - "text": "a", - "label": 25 - }, - { - "box": [ - 386.0, - 409.0, - 599.0, - 409.0, - 599.0, - 373.0, - 386.0, - 373.0 - ], - "text": "089-46169340", - "label": 5 - } - ] - } + "file_name": "receipt_1.jpeg", + "height": 348, + "width": 348, + "annotations": [ + { + "box": [ + 263.0, + 283.0, + 325.0, + 283.0, + 325 + ], + "label": 25 + }, + { + "box": [ + 386.0, + 409.0, + 599.0, + 409.0, + 599.0, + 373.0, + 386.0, + 373.0 + ], + "text": "089-46169340", + "label": 5 + } + ] + } annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: @@ -746,7 +742,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): # FIXME: this one does not create the file file = BytesIO(mock_image_stream) for i in range(2): - fn_i = image_folder.join(f"receipt_{i}.png") + fn_i = image_folder.join(f"receipt_{i}.jpeg") with open(fn_i, "wb") as f: f.write(file.getbuffer()) # fn_l = annotations_folder.join(f"receipt_{i}.json") From 82ed210b63c6d534b29ae58e3ec0e4049b0352b9 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 22:19:07 +0100 Subject: [PATCH 18/33] [FIX] mock_wildreceipt_dataset labels --- tests/conftest.py | 13 ++----------- tests/pytorch/test_datasets_pt.py | 4 ++-- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 2d52f46bcd..9d422b4412 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -670,7 +670,6 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): "height": 348, "width": 348, "annotations": [ - { "box": [ 263.0, @@ -706,16 +705,6 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): "height": 348, "width": 348, "annotations": [ - { - "box": [ - 263.0, - 283.0, - 325.0, - 283.0, - 325 - ], - "label": 25 - }, { "box": [ 386.0, @@ -736,7 +725,9 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: json.dump(labels, f) + f.write("\n") json.dump(labels2, f) + f.write("\n") # fn_i = root.join(labels["file_name"]) # os.makedirs(os.path.dirname(fn_i), exist_ok=True) # FIXME: this one does not create the file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index c457fee582..8de0450920 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -564,7 +564,7 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): "input_size, num_samples, recognition", [ [[512, 512], 2, False], - [[32, 128], 2, True], + [[32, 128], 1, True], ], ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): @@ -576,7 +576,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ recognition_task=recognition, ) # TODO: FINISH THIS - assert len(ds) == num_samples - 1 # -1 because of the test set 90 / 10 split + assert len(ds) == num_samples assert repr(ds) == f"WILDRECEIPT(train={True})" if recognition: _validate_dataset_recognition_part(ds, input_size) From 1e06371a5f745045fd1cd22292271f5d7a18e2cd Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 
22:34:14 +0100 Subject: [PATCH 19/33] [FIX] mock_wildreceipt_dataset labels --- doctr/datasets/wildreceipt.py | 22 +++++++++++++++++++++- tests/pytorch/test_datasets_pt.py | 3 ++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 661ae4a754..6a3becf3db 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -64,6 +64,19 @@ def __init__( np_dtype = np.float32 self.data: List[Tuple[str, Dict[str, Any]]] = [] + + # define folder to write IMGUR5K recognition dataset + reco_folder_name = "WILDRECEIPT_recognition_train" if self.train else "WILDRECEIPT_recognition_test" + reco_folder_name = "Poly_" + reco_folder_name if use_polygons else reco_folder_name + reco_folder_path = os.path.join(os.path.dirname(self.root), reco_folder_name) + reco_images_counter = 0 + + if recognition_task and os.path.isdir(reco_folder_path): + self._read_from_folder(reco_folder_path) + return + elif recognition_task and not os.path.isdir(reco_folder_path): + os.makedirs(reco_folder_path, exist_ok=False) + with open(label_path, 'r') as file: data = file.read() # Split the text file into separate JSON strings @@ -98,11 +111,18 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - self.data.append((crop, label)) + with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f: + f.write(label) + tmp_img = Image.fromarray(crop) + tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png")) + reco_images_counter += 1 + # self.data.append((crop, label)) else: self.data.append( (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))) ) + if recognition_task: + self._read_from_folder(reco_folder_path) self.root = tmp_root def extra_repr(self) -> str: diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 8de0450920..80c5159680 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -564,7 +564,7 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): "input_size, num_samples, recognition", [ [[512, 512], 2, False], - [[32, 128], 1, True], + [[32, 128], 5, True], ], ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): @@ -576,6 +576,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ recognition_task=recognition, ) # TODO: FINISH THIS + print(f"recognition {recognition}") assert len(ds) == num_samples assert repr(ds) == f"WILDRECEIPT(train={True})" if recognition: From a968db487bfe3c0b4dfec6db3cac82192d9d5bc6 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 22:35:35 +0100 Subject: [PATCH 20/33] remove todos --- tests/conftest.py | 9 +-------- tests/pytorch/test_datasets_pt.py | 9 --------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9d422b4412..56fc369c1e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -663,8 +663,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): wildreceipt_root = root.mkdir("wildreceipt") annotations_folder = wildreceipt_root image_folder = wildreceipt_root.mkdir("image_files") - # image_folder = image_folder.mkdir("Image_58") - # image_folder = image_folder.mkdir("20") + labels = { "file_name": "receipt_0.jpeg", "height": 348, @@ -728,15 
+727,9 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): f.write("\n") json.dump(labels2, f) f.write("\n") - # fn_i = root.join(labels["file_name"]) - # os.makedirs(os.path.dirname(fn_i), exist_ok=True) - # FIXME: this one does not create the file file = BytesIO(mock_image_stream) for i in range(2): fn_i = image_folder.join(f"receipt_{i}.jpeg") with open(fn_i, "wb") as f: f.write(file.getbuffer()) - # fn_l = annotations_folder.join(f"receipt_{i}.json") - # with open(fn_l, "w") as f: - # json.dump(labels, f) return str(image_folder), str(annotation_file) \ No newline at end of file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 80c5159680..2168ea0e3f 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -14,9 +14,6 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_polygons=False): # Fetch one sample img, target = ds[0] - # TODO: ADD More then one element - print(f"img, label 0 {ds[0]}") - print(f"img, label 1 {ds[1]}") assert isinstance(img, torch.Tensor) assert img.shape == (3, *input_size) @@ -44,9 +41,6 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_poly pin_memory=True, collate_fn=ds.collate_fn, ) - print(f"loder is {loader}") - print(f"loder is type {type(loader)}") - print(f"next(iter(loader)) {next(iter(loader))}") images, targets = next(iter(loader)) assert isinstance(images, torch.Tensor) and images.shape == (batch_size, 3, *input_size) @@ -575,14 +569,11 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ use_polygons=rotate, recognition_task=recognition, ) - # TODO: FINISH THIS - print(f"recognition {recognition}") assert len(ds) == num_samples assert repr(ds) == f"WILDRECEIPT(train={True})" if recognition: _validate_dataset_recognition_part(ds, input_size) else: - # FIXME: you have an error here _validate_dataset(ds, input_size, is_polygons=rotate) # NOTE: following datasets are only for recognition task From e42c71e6d3fb6b4427b180fd016a267248061957 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 22:36:40 +0100 Subject: [PATCH 21/33] remove todos --- doctr/datasets/wildreceipt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 6a3becf3db..7bb3c8de3e 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -84,7 +84,6 @@ def __init__( box: Union[List[float], np.ndarray] _targets = [] for json_string in json_strings: - # FIXME there is a bug here check it with the unit test json_data = json.loads(json_string) img_path = json_data['file_name'] annotations = json_data['annotations'] From 4ec3bf52cc8029663f5f0d765231ad8160577055 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 10:24:21 +0100 Subject: [PATCH 22/33] [UPDATE] wildreceipt_image_folder --- tests/conftest.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 56fc369c1e..9439b07282 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -665,7 +665,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): image_folder = wildreceipt_root.mkdir("image_files") labels = { - "file_name": "receipt_0.jpeg", + "file_name": "Image_58/20/receipt_0.jpeg", "height": 348, "width": 348, "annotations": [ @@ -700,7 +700,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): ] } labels2 = { - "file_name": "receipt_1.jpeg", + 
"file_name": "Image_58/20/receipt_1.jpeg", "height": 348, "width": 348, "annotations": [ @@ -728,8 +728,10 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): json.dump(labels2, f) f.write("\n") file = BytesIO(mock_image_stream) + wildreceipt_image_folder = image_folder.mkdir("Image_58") + wildreceipt_image_folder = wildreceipt_image_folder.mkdir("20") for i in range(2): - fn_i = image_folder.join(f"receipt_{i}.jpeg") + fn_i = wildreceipt_image_folder.join(f"receipt_{i}.jpeg") with open(fn_i, "wb") as f: f.write(file.getbuffer()) return str(image_folder), str(annotation_file) \ No newline at end of file From ff4b3998003dda4d19bd75181aea097165ecae08 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 10:43:03 +0100 Subject: [PATCH 23/33] [ADD] test_wildreceipt_dataset tf --- tests/tensorflow/test_datasets_tf.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/tensorflow/test_datasets_tf.py b/tests/tensorflow/test_datasets_tf.py index 6a89c89263..ac426e4bd0 100644 --- a/tests/tensorflow/test_datasets_tf.py +++ b/tests/tensorflow/test_datasets_tf.py @@ -535,6 +535,29 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): _validate_dataset(ds, input_size, is_polygons=rotate) +@pytest.mark.parametrize("rotate", [True, False]) +@pytest.mark.parametrize( + "input_size, num_samples, recognition", + [ + [[512, 512], 2, False], + [[32, 128], 5, True], + ], +) +def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): + ds = datasets.WILDRECEIPT( + *mock_wildreceipt_dataset, + train=True, + img_transforms=Resize(input_size), + use_polygons=rotate, + recognition_task=recognition, + ) + assert len(ds) == num_samples + assert repr(ds) == f"WILDRECEIPT(train={True})" + if recognition: + _validate_dataset_recognition_part(ds, input_size) + else: + _validate_dataset(ds, input_size, is_polygons=rotate) + # NOTE: following datasets are only for recognition task From 2a7d1e00db91ba39e9e0b20baaebdea15e64305f Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 10:43:37 +0100 Subject: [PATCH 24/33] [UPDATE] WILDRECEIPT optimize imports --- doctr/datasets/wildreceipt.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 7bb3c8de3e..85359a5e8a 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -6,13 +6,10 @@ import glob import json import os -from pathlib import Path from typing import Any, Dict, List, Tuple, Union -import cv2 import numpy as np from PIL import Image -from tqdm import tqdm from .datasets import AbstractDataset from .utils import convert_target_to_relative, crop_bboxes_from_image From bffca24aad0eccdfa7eadad9dd45ec63a54d346c Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 10:55:07 +0100 Subject: [PATCH 25/33] [FIX] WILDRECEIPT self.data --- doctr/datasets/wildreceipt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 85359a5e8a..5fdded8adf 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -6,6 +6,7 @@ import glob import json import os +from pathlib import Path from typing import Any, Dict, List, Tuple, Union import numpy as np @@ -59,8 +60,7 @@ def __init__( tmp_root = img_folder self.train = train np_dtype = np.float32 - self.data: List[Tuple[str, Dict[str, Any]]] = [] - + self.data: List[Tuple[Union[str, Path, 
np.ndarray], Union[str, Dict[str, Any]]]] = [] # define folder to write IMGUR5K recognition dataset reco_folder_name = "WILDRECEIPT_recognition_train" if self.train else "WILDRECEIPT_recognition_test" From 954b8b02126a968cffccd62dfd872cdbcab257b0 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 14:50:32 +0100 Subject: [PATCH 26/33] [UPDATE] save fata in RAM --- doctr/datasets/wildreceipt.py | 48 +++++++---------------------------- 1 file changed, 9 insertions(+), 39 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 5fdded8adf..96bd8e8658 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -17,13 +17,15 @@ __all__ = ["WILDRECEIPT"] +from ..utils import polygon_to_bbox + class WILDRECEIPT(AbstractDataset): - """WildReceipt is a collection of receipts. It contains, for each photo, of a list of OCRs - with bounding box, text, and class." - `_ | + """WildReceipt dataset from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" + `_ | `repository `_. - >>> # NOTE: You need to download/generate the dataset from the repository. + >>> # NOTE: You need to download the dataset from the repository. >>> from doctr.datasets import WILDRECEIPT >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/", >>> label_path="/path/to/wildreceipt/train.txt") @@ -62,17 +64,6 @@ def __init__( np_dtype = np.float32 self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = [] - # define folder to write IMGUR5K recognition dataset - reco_folder_name = "WILDRECEIPT_recognition_train" if self.train else "WILDRECEIPT_recognition_test" - reco_folder_name = "Poly_" + reco_folder_name if use_polygons else reco_folder_name - reco_folder_path = os.path.join(os.path.dirname(self.root), reco_folder_name) - reco_images_counter = 0 - - if recognition_task and os.path.isdir(reco_folder_path): - self._read_from_folder(reco_folder_path) - return - elif recognition_task and not os.path.isdir(reco_folder_path): - os.makedirs(reco_folder_path, exist_ok=False) with open(label_path, 'r') as file: data = file.read() @@ -98,7 +89,9 @@ def __init__( dtype=np_dtype ) else: - box = self._convert_xmin_ymin(coordinates) + box_targets = polygon_to_bbox( + tuple((coordinates[i], coordinates[i + 1]) for i in range(0, len(coordinates), 2))) + box = [coord for coords in box_targets for coord in coords] _targets.append((annotation['text'], box)) text_targets, box_targets = zip(*_targets) @@ -107,35 +100,12 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f: - f.write(label) - tmp_img = Image.fromarray(crop) - tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png")) - reco_images_counter += 1 - # self.data.append((crop, label)) + self.data.append((crop, label)) else: self.data.append( (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))) ) - if recognition_task: - self._read_from_folder(reco_folder_path) self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}" - - def _read_from_folder(self, path: str) -> None: - for img_path in glob.glob(os.path.join(path, "*.png")): - with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: - self.data.append((img_path, f.read())) - - @staticmethod - def 
_convert_xmin_ymin(box: List) -> List: - if len(box) == 4: - return box - x1, y1, x2, y2, x3, y3, x4, y4 = box - x_min = min(x1, x2, x3, x4) - x_max = max(x1, x2, x3, x4) - y_min = min(y1, y2, y3, y4) - y_max = max(y1, y2, y3, y4) - return [x_min, y_min, x_max, y_max] From edbcaf2ad134c2e026d1212808a47f0931227ef0 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 15:02:15 +0100 Subject: [PATCH 27/33] [UPDATE] docs --- docs/source/index.rst | 1 + docs/source/using_doctr/using_datasets.rst | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index bad3a46420..ab3e7fb4b8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,6 +66,7 @@ Supported datasets * IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. * MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. * IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. toctree:: diff --git a/docs/source/using_doctr/using_datasets.rst b/docs/source/using_doctr/using_datasets.rst index 35146e576f..67fa8a6325 100644 --- a/docs/source/using_doctr/using_datasets.rst +++ b/docs/source/using_doctr/using_datasets.rst @@ -41,6 +41,8 @@ This datasets contains the information to train or validate a text detection mod +-----------------------------+---------------------------------+---------------------------------+----------------------------------+ | IMGUR5K | 7149 | 796 | Handwritten / external resources | +-----------------------------+---------------------------------+---------------------------------+----------------------------------+ +| WILDRECEIPT | 1268 | 472 | external resources | ++-----------------------------+---------------------------------+---------------------------------+----------------------------------+ .. code:: python3 @@ -84,6 +86,8 @@ This datasets contains the information to train or validate a text recognition m +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ | IIITHWS | 7141797 | 793533 | english / handwritten / external resources | +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ +| WILDRECEIPT | 1268 | 472 | english / external resources | ++-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ .. 
code:: python3 From e257a29b0c25deb0c37245c4459969c6258eea32 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 18:42:24 +0100 Subject: [PATCH 28/33] [UPDATE] box wildreceipt --- doctr/datasets/wildreceipt.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 96bd8e8658..90815c649c 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -89,9 +89,8 @@ def __init__( dtype=np_dtype ) else: - box_targets = polygon_to_bbox( - tuple((coordinates[i], coordinates[i + 1]) for i in range(0, len(coordinates), 2))) - box = [coord for coords in box_targets for coord in coords] + x, y = coordinates[::2], coordinates[1::2] + box = [min(x), min(y), max(x), max(y)] _targets.append((annotation['text'], box)) text_targets, box_targets = zip(*_targets) From 2b3a5780e28ac0f84334fca8ac8b73d5695cafa0 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Fri, 27 Oct 2023 11:08:52 +0100 Subject: [PATCH 29/33] [UPDATE] docs --- docs/source/using_doctr/using_datasets.rst | 2 +- doctr/datasets/wildreceipt.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/using_doctr/using_datasets.rst b/docs/source/using_doctr/using_datasets.rst index 67fa8a6325..52c5f7e24d 100644 --- a/docs/source/using_doctr/using_datasets.rst +++ b/docs/source/using_doctr/using_datasets.rst @@ -86,7 +86,7 @@ This datasets contains the information to train or validate a text recognition m +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ | IIITHWS | 7141797 | 793533 | english / handwritten / external resources | +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ -| WILDRECEIPT | 1268 | 472 | english / external resources | +| WILDRECEIPT | 49377 | 19598 | english / external resources | +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ .. code:: python3 diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 90815c649c..107c211b6e 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -25,7 +25,7 @@ class WILDRECEIPT(AbstractDataset): `_ | `repository `_. - >>> # NOTE: You need to download the dataset from the repository. + >>> # NOTE: You need to download the dataset first. 
>>> from doctr.datasets import WILDRECEIPT >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/", >>> label_path="/path/to/wildreceipt/train.txt") @@ -99,7 +99,8 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - self.data.append((crop, label)) + if not any(char in label for char in ["", "-", "*", "/", "=", "#", "@"]): + self.data.append((crop, label)) else: self.data.append( (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))) From c18175bde41378c8c3a31d17b718a3fa3b307c1d Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Fri, 27 Oct 2023 11:20:14 +0100 Subject: [PATCH 30/33] [UPDATE] filter empty and whitespace --- doctr/datasets/wildreceipt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 107c211b6e..5fa5464b8b 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -99,7 +99,7 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - if not any(char in label for char in ["", "-", "*", "/", "=", "#", "@"]): + if not any(char in label for char in ["", " "]): self.data.append((crop, label)) else: self.data.append( From 6c3379953495de3d974f1c606aa3c6b1eb5db0fd Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Fri, 27 Oct 2023 12:03:42 +0100 Subject: [PATCH 31/33] [UPDATE] filter empty and whitespace --- doctr/datasets/wildreceipt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 5fa5464b8b..9330e68a7c 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -99,7 +99,7 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - if not any(char in label for char in ["", " "]): + if label and " " not in label: self.data.append((crop, label)) else: self.data.append( From fcedaba939407b35456dbe7b713c7d47320be91b Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Fri, 27 Oct 2023 12:08:19 +0100 Subject: [PATCH 32/33] [FIX] format --- doctr/datasets/wildreceipt.py | 32 ++++++------- tests/conftest.py | 69 ++++++---------------------- tests/pytorch/test_datasets_pt.py | 3 +- tests/tensorflow/test_datasets_tf.py | 1 + 4 files changed, 32 insertions(+), 73 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 9330e68a7c..de394595fa 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -3,21 +3,18 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. 
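Editorial aside, not part of the patch: the recognition-mode path that the preceding patches converge on cuts word crops out of the page image with docTR's crop_bboxes_from_image helper (the same call used in the loader above) and keeps a crop only when its transcription is non-empty and contains no space. A rough sketch under those assumptions; recognition_samples is an illustrative helper, and the absolute import path follows from the relative import "from .utils import crop_bboxes_from_image" shown above.

import os

import numpy as np

from doctr.datasets.utils import crop_bboxes_from_image


def recognition_samples(img_folder, rel_img_path, targets):
    # targets: (text, [xmin, ymin, xmax, ymax]) pairs parsed from one JSON line
    texts, boxes = zip(*targets)
    crops = crop_bboxes_from_image(
        img_path=os.path.join(img_folder, rel_img_path),
        geoms=np.asarray(boxes, dtype=int).clip(min=0),
    )
    # same filtering rule as the patch: drop empty labels and labels containing a space
    return [(crop, label) for crop, label in zip(crops, texts) if label and " " not in label]
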
-import glob
 import json
 import os
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Union

 import numpy as np
-from PIL import Image

 from .datasets import AbstractDataset
 from .utils import convert_target_to_relative, crop_bboxes_from_image

 __all__ = ["WILDRECEIPT"]

-from ..utils import polygon_to_bbox


 class WILDRECEIPT(AbstractDataset):
@@ -44,13 +41,13 @@ class WILDRECEIPT(AbstractDataset):
     """
     def __init__(
-            self,
-            img_folder: str,
-            label_path: str,
-            train: bool = True,
-            use_polygons: bool = False,
-            recognition_task: bool = False,
-            **kwargs: Any,
+        self,
+        img_folder: str,
+        label_path: str,
+        train: bool = True,
+        use_polygons: bool = False,
+        recognition_task: bool = False,
+        **kwargs: Any,
     ) -> None:
         super().__init__(
             img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
         )
@@ -64,19 +61,18 @@ def __init__(
         np_dtype = np.float32
         self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
-
-        with open(label_path, 'r') as file:
+        with open(label_path, "r") as file:
             data = file.read()
         # Split the text file into separate JSON strings
-        json_strings = data.strip().split('\n')
+        json_strings = data.strip().split("\n")
         box: Union[List[float], np.ndarray]
         _targets = []
         for json_string in json_strings:
             json_data = json.loads(json_string)
-            img_path = json_data['file_name']
-            annotations = json_data['annotations']
+            img_path = json_data["file_name"]
+            annotations = json_data["annotations"]
             for annotation in annotations:
-                coordinates = annotation['box']
+                coordinates = annotation["box"]
                 if use_polygons:
                     # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                     box = np.array(
                         [
                             [coordinates[0], coordinates[1]],
                             [coordinates[2], coordinates[3]],
                             [coordinates[4], coordinates[5]],
                             [coordinates[6], coordinates[7]],
                         ],
-                        dtype=np_dtype
+                        dtype=np_dtype,
                     )
                 else:
                     x, y = coordinates[::2], coordinates[1::2]
                     box = [min(x), min(y), max(x), max(y)]
-                _targets.append((annotation['text'], box))
+                _targets.append((annotation["text"], box))

         text_targets, box_targets = zip(*_targets)
         if recognition_task:
diff --git a/tests/conftest.py b/tests/conftest.py
index 9439b07282..4a18e9bb28 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,4 @@
 import json
-import os
 import shutil
 import tempfile
 from io import BytesIO
@@ -656,6 +655,7 @@ def mock_iiithws_dataset(tmpdir_factory, mock_image_stream):
         f.write(file.getbuffer())
     return str(root), str(label_file)

+
 @pytest.fixture(scope="session")
 def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream):
     file = BytesIO(mock_image_stream)
@@ -665,61 +665,22 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream):
     image_folder = wildreceipt_root.mkdir("image_files")

     labels = {
-            "file_name": "Image_58/20/receipt_0.jpeg",
-            "height": 348,
-            "width": 348,
-            "annotations": [
-                {
-                    "box": [
-                        263.0,
-                        283.0,
-                        325.0,
-                        283.0,
-                        325.0,
-                        260.0,
-                        263.0,
-                        260.0
-                    ],
-                    "text": "$55.96",
-                    "label": 17
-                },
-                {
-                    "box": [
-                        274.0,
-                        308.0,
-                        326.0,
-                        308.0,
-                        326.0,
-                        286.0,
-                        274.0,
-                        286.0
-                    ],
-                    "text": "$4.48",
-                    "label": 19
-                }
-            ]
+        "file_name": "Image_58/20/receipt_0.jpeg",
+        "height": 348,
+        "width": 348,
+        "annotations": [
+            {"box": [263.0, 283.0, 325.0, 283.0, 325.0, 260.0, 263.0, 260.0], "text": "$55.96", "label": 17},
+            {"box": [274.0, 308.0, 326.0, 308.0, 326.0, 286.0, 274.0, 286.0], "text": "$4.48", "label": 19},
+        ],
     }

     labels2 = {
-            "file_name": "Image_58/20/receipt_1.jpeg",
-            "height": 348,
-            "width": 348,
-            "annotations": [
-                {
-                    "box": [
-                        386.0,
-                        409.0,
-                        599.0,
-                        409.0,
-                        599.0,
-                        373.0,
-                        386.0,
-                        373.0
+        "file_name": "Image_58/20/receipt_1.jpeg",
+        "height": 348,
+        "width": 348,
+        "annotations": [
+            {"box": [386.0, 409.0, 599.0, 409.0, 599.0, 373.0, 386.0, 373.0], "text": "089-46169340", "label": 5}
         ],
-                    "text": "089-46169340",
-                    "label": 5
-                }
-            ]
-        }
+    }

     annotation_file = annotations_folder.join("train.txt")
     with open(annotation_file, "w") as f:
@@ -734,4 +695,4 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream):
         fn_i = wildreceipt_image_folder.join(f"receipt_{i}.jpeg")
         with open(fn_i, "wb") as f:
             f.write(file.getbuffer())
-    return str(image_folder), str(annotation_file)
\ No newline at end of file
+    return str(image_folder), str(annotation_file)
diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py
index 2168ea0e3f..23d2a69fd1 100644
--- a/tests/pytorch/test_datasets_pt.py
+++ b/tests/pytorch/test_datasets_pt.py
@@ -576,6 +576,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_
     else:
         _validate_dataset(ds, input_size, is_polygons=rotate)

+
 # NOTE: following datasets are only for recognition task

@@ -600,4 +601,4 @@ def test_iiithws_dataset(mock_iiithws_dataset):
     assert len(ds) == 4  # Actual set has 7141797 train and 793533 test samples
     assert repr(ds) == f"IIITHWS(train={True})"
-    _validate_dataset_recognition_part(ds, input_size)
\ No newline at end of file
+    _validate_dataset_recognition_part(ds, input_size)
diff --git a/tests/tensorflow/test_datasets_tf.py b/tests/tensorflow/test_datasets_tf.py
index ac426e4bd0..e8121b9d38 100644
--- a/tests/tensorflow/test_datasets_tf.py
+++ b/tests/tensorflow/test_datasets_tf.py
@@ -558,6 +558,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_
     else:
         _validate_dataset(ds, input_size, is_polygons=rotate)

+
 # NOTE: following datasets are only for recognition task

From 478a420561d41e2cf6aab386126d9da329eb03ab Mon Sep 17 00:00:00 2001
From: HamzaGbada
Date: Fri, 27 Oct 2023 13:36:41 +0100
Subject: [PATCH 33/33] [FIX] format

---
 doctr/datasets/wildreceipt.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py
index de394595fa..a802290110 100644
--- a/doctr/datasets/wildreceipt.py
+++ b/doctr/datasets/wildreceipt.py
@@ -16,7 +16,6 @@
 __all__ = ["WILDRECEIPT"]


-
 class WILDRECEIPT(AbstractDataset):
     """WildReceipt dataset from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction"
     `_ |