From 7ae18dcbeb460c93e8db857b8d45ccf24e7dbe93 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:14:28 +0100 Subject: [PATCH 01/33] [ADD] wildreceipt init --- doctr/datasets/wildreceipt.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 doctr/datasets/wildreceipt.py diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py new file mode 100644 index 0000000000..0f69288e24 --- /dev/null +++ b/doctr/datasets/wildreceipt.py @@ -0,0 +1,38 @@ +# Copyright (C) 2021-2023, Mindee. + +# This program is licensed under the Apache License 2.0. +# See LICENSE or go to for full license details. + +import glob +import json +import os +from pathlib import Path +from typing import Any, Dict, List, Tuple, Union + +import cv2 +import numpy as np +from PIL import Image +from tqdm import tqdm + +from .datasets import AbstractDataset +from .utils import convert_target_to_relative, crop_bboxes_from_image + +__all__ = ["WILDRECEIPT"] + + +class WILDRECEIPT(AbstractDataset): + + + def __init__( + self, + img_folder: str, + label_path: str, + train: bool = True, + use_polygons: bool = False, + recognition_task: bool = False, + **kwargs: Any, + ) -> None: + super().__init__( + img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs + ) + pass From f4c489522d98658d271d687178546fa8552b86a5 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:15:46 +0100 Subject: [PATCH 02/33] [ADD] wildreceipt init --- doctr/datasets/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doctr/datasets/__init__.py b/doctr/datasets/__init__.py index a9f8c841f3..89ee15f2f4 100644 --- a/doctr/datasets/__init__.py +++ b/doctr/datasets/__init__.py @@ -19,6 +19,7 @@ from .synthtext import * from .utils import * from .vocabs import * +from .wildreceipt import * if is_tf_available(): from .loader import * From a883ed0e2fae1f9dcf44491e2f367623628507b3 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:39:37 +0100 Subject: [PATCH 03/33] [ADD] wildreceipt _convert_xmin_ymin --- doctr/datasets/wildreceipt.py | 73 ++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 0f69288e24..14d281f970 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -21,8 +21,28 @@ class WILDRECEIPT(AbstractDataset): + """WildReceipt is a collection of receipts. It contains, for each photo, of a list of OCRs - with bounding box, text, and class." + `_ | + >>> # NOTE: You need to download/generate the dataset from the repository. + >>> from doctr.datasets import WILDRECEIPT + >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/image_files", + >>> label_path="/path/to/wildreceipt/train.txt") + >>> img, target = train_set[0] + >>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/image_files", + >>> label_path="/path/to/wildreceipt/test.txt") + >>> img, target = test_set[0] + + Args: + img_folder: folder with all the images of the dataset + label_path: path to the annotations file of the dataset + train: whether the subset should be the training one + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + **kwargs: keyword arguments from `AbstractDataset`. 
+ """ + def __init__( self, img_folder: str, @@ -35,4 +55,55 @@ def __init__( super().__init__( img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs ) - pass + # File existence check + if not os.path.exists(label_path) or not os.path.exists(img_folder): + raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}") + + tmp_root = os.path.join(self.root, 'wildreceipt/') + self.train = train + + self.data: List[Tuple[str, Dict[str, Any]]] = [] + + self.filename = "train.txt" if self.train else "test.txt" + file_path = os.path.join(tmp_root, self.filename) + # logger.debug(f'the file names: {tmp_root}') + with open(file_path, 'r') as file: + data = file.read() + # Split the text file into separate JSON strings + json_strings = data.strip().split('\n') + for json_string in json_strings: + json_data = json.loads(json_string) + file_name = json_data['file_name'] + annotations = json_data['annotations'] + _targets = [(convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) + for annotation in annotations if get_area(convert_xmin_ymin(annotation['box'])) >= 50] + if _targets: + box_targets, text_units, labels = zip(*_targets) + + self.data.append(( + file_name, + dict(boxes=np.asarray(box_targets, dtype=int), labels=list(labels), + text_units=list(text_units)), + )) + self.root = tmp_root + + +def extra_repr(self) -> str: + return f"train={self.train}" + +def _read_from_folder(self, path: str) -> None: + for img_path in glob.glob(os.path.join(path, "*.png")): + with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: + self.data.append((img_path, f.read())) +def _convert_xmin_ymin(self, box) -> List: + if len(box) == 4: + return box + x1, y1, x2, y2, x3, y3, x4, y4 = box + x_min = min(x1, x2, x3, x4) + x_max = max(x1, x2, x3, x4) + y_min = min(y1, y2, y3, y4) + y_max = max(y1, y2, y3, y4) + return [x_min, y_min, x_max, y_max] + + + From ddb4d670322a733d2e5d115297428bc0a71caa76 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:41:44 +0100 Subject: [PATCH 04/33] [ADD] wildreceipt _convert_xmin_ymin --- doctr/datasets/wildreceipt.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 14d281f970..d7674b670f 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -44,13 +44,13 @@ class WILDRECEIPT(AbstractDataset): """ def __init__( - self, - img_folder: str, - label_path: str, - train: bool = True, - use_polygons: bool = False, - recognition_task: bool = False, - **kwargs: Any, + self, + img_folder: str, + label_path: str, + train: bool = True, + use_polygons: bool = False, + recognition_task: bool = False, + **kwargs: Any, ) -> None: super().__init__( img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs @@ -75,8 +75,8 @@ def __init__( json_data = json.loads(json_string) file_name = json_data['file_name'] annotations = json_data['annotations'] - _targets = [(convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) - for annotation in annotations if get_area(convert_xmin_ymin(annotation['box'])) >= 50] + _targets = [(_convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) + for annotation in annotations] if _targets: box_targets, text_units, labels = zip(*_targets) @@ -91,11 +91,15 @@ def __init__( 
def extra_repr(self) -> str: return f"train={self.train}" + def _read_from_folder(self, path: str) -> None: for img_path in glob.glob(os.path.join(path, "*.png")): with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: self.data.append((img_path, f.read())) -def _convert_xmin_ymin(self, box) -> List: + + +@classmethod +def _convert_xmin_ymin(box: List) -> List: if len(box) == 4: return box x1, y1, x2, y2, x3, y3, x4, y4 = box @@ -104,6 +108,3 @@ def _convert_xmin_ymin(self, box) -> List: y_min = min(y1, y2, y3, y4) y_max = max(y1, y2, y3, y4) return [x_min, y_min, x_max, y_max] - - - From 15abe0d86e6b5749fac1f52b2bf6e861b49e44b5 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 12 Oct 2023 17:45:08 +0100 Subject: [PATCH 05/33] [ADD] wildreceipt _convert_xmin_ymin --- doctr/datasets/wildreceipt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index d7674b670f..47982019b4 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -66,7 +66,7 @@ def __init__( self.filename = "train.txt" if self.train else "test.txt" file_path = os.path.join(tmp_root, self.filename) - # logger.debug(f'the file names: {tmp_root}') + with open(file_path, 'r') as file: data = file.read() # Split the text file into separate JSON strings From dcb63cbb833c237211286e842a0e61ae307887ee Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Sat, 14 Oct 2023 16:06:19 +0100 Subject: [PATCH 06/33] [ADD] wildreceipt test --- doctr/datasets/wildreceipt.py | 2 +- tests/pytorch/test_datasets_pt.py | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 47982019b4..7db3003b40 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -23,7 +23,7 @@ class WILDRECEIPT(AbstractDataset): """WildReceipt is a collection of receipts. It contains, for each photo, of a list of OCRs - with bounding box, text, and class." `_ | - + `repository `_. >>> # NOTE: You need to download/generate the dataset from the repository. 
>>> from doctr.datasets import WILDRECEIPT diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 8f98e460fb..2235d0b3b0 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -551,6 +551,30 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): _validate_dataset(ds, input_size, is_polygons=rotate) +@pytest.mark.parametrize("rotate", [True, False]) +@pytest.mark.parametrize( + "input_size, num_samples, recognition", + [ + [[512, 512], 3, False], # Actual set has 7149 train and 796 test samples + [[32, 128], 5, True], # recognition + ], +) +def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt): + ds = datasets.WILDRECEIPT( + *mock_wildreceipt, + train=True, + img_transforms=Resize(input_size), + use_polygons=rotate, + recognition_task=recognition, + ) + + assert len(ds) == num_samples - 1 # -1 because of the test set 90 / 10 split + assert repr(ds) == f"WILDRECEIPT(train={True})" + if recognition: + _validate_dataset_recognition_part(ds, input_size) + else: + _validate_dataset(ds, input_size, is_polygons=rotate) + # NOTE: following datasets are only for recognition task @@ -575,4 +599,4 @@ def test_iiithws_dataset(mock_iiithws_dataset): assert len(ds) == 4 # Actual set has 7141797 train and 793533 test samples assert repr(ds) == f"IIITHWS(train={True})" - _validate_dataset_recognition_part(ds, input_size) + _validate_dataset_recognition_part(ds, input_size) \ No newline at end of file From 87bf015f485db876b43b4a053ff16920cb8f2814 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Sat, 14 Oct 2023 17:15:55 +0100 Subject: [PATCH 07/33] [ADD] wildreceipt test --- tests/pytorch/test_datasets_pt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 2235d0b3b0..cdd8811de5 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -555,8 +555,8 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): @pytest.mark.parametrize( "input_size, num_samples, recognition", [ - [[512, 512], 3, False], # Actual set has 7149 train and 796 test samples - [[32, 128], 5, True], # recognition + [[512, 512], 3, False], + [[32, 128], 5, True], ], ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt): @@ -567,7 +567,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ use_polygons=rotate, recognition_task=recognition, ) - + # TODO: FINISH THIS assert len(ds) == num_samples - 1 # -1 because of the test set 90 / 10 split assert repr(ds) == f"WILDRECEIPT(train={True})" if recognition: From 17c1112ab012813085ddeaea826b56624a5f5c45 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Sun, 15 Oct 2023 21:00:02 +0100 Subject: [PATCH 08/33] [UPDATE] wildreceipt use_polygon --- doctr/datasets/wildreceipt.py | 39 +++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 7db3003b40..5dcba2a5b5 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -61,7 +61,7 @@ def __init__( tmp_root = os.path.join(self.root, 'wildreceipt/') self.train = train - + np_dtype = np.float32 self.data: List[Tuple[str, Dict[str, Any]]] = [] self.filename = "train.txt" if self.train else "test.txt" @@ -71,20 +71,37 @@ def __init__( data = file.read() # Split the text 
file into separate JSON strings json_strings = data.strip().split('\n') + box: Union[List[float], np.ndarray] + _targets = [] for json_string in json_strings: json_data = json.loads(json_string) file_name = json_data['file_name'] annotations = json_data['annotations'] - _targets = [(_convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) - for annotation in annotations] - if _targets: - box_targets, text_units, labels = zip(*_targets) - - self.data.append(( - file_name, - dict(boxes=np.asarray(box_targets, dtype=int), labels=list(labels), - text_units=list(text_units)), - )) + for annotation in annotations: + coordinates = annotation['box'] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [coordinates[0], coordinates[1]], + [coordinates[2], coordinates[3]], + [coordinates[4], coordinates[5]], + [coordinates[6], coordinates[7]], + ], + dtype=np_dtype + ) + else: + box = _convert_xmin_ymin(coordinates) + _targets = [(_convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) + for annotation in annotations] + if _targets: + box_targets, text_units, labels = zip(*_targets) + + self.data.append(( + file_name, + dict(boxes=np.asarray(box_targets, dtype=int), labels=list(labels), + text_units=list(text_units)), + )) self.root = tmp_root From f197337f4a94cfb202bee87d20bd7b05f2b93eae Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Sun, 15 Oct 2023 22:32:12 +0100 Subject: [PATCH 09/33] [UPDATE] wildreceipt img_folder --- doctr/datasets/wildreceipt.py | 77 +++++++++++++++++------------------ 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 5dcba2a5b5..4def66aa6e 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -27,10 +27,10 @@ class WILDRECEIPT(AbstractDataset): >>> # NOTE: You need to download/generate the dataset from the repository. 
>>> from doctr.datasets import WILDRECEIPT - >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/image_files", + >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/", >>> label_path="/path/to/wildreceipt/train.txt") >>> img, target = train_set[0] - >>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/image_files", + >>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/", >>> label_path="/path/to/wildreceipt/test.txt") >>> img, target = test_set[0] @@ -59,15 +59,12 @@ def __init__( if not os.path.exists(label_path) or not os.path.exists(img_folder): raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}") - tmp_root = os.path.join(self.root, 'wildreceipt/') + tmp_root = img_folder self.train = train np_dtype = np.float32 self.data: List[Tuple[str, Dict[str, Any]]] = [] - self.filename = "train.txt" if self.train else "test.txt" - file_path = os.path.join(tmp_root, self.filename) - - with open(file_path, 'r') as file: + with open(label_path, 'r') as file: data = file.read() # Split the text file into separate JSON strings json_strings = data.strip().split('\n') @@ -75,7 +72,7 @@ def __init__( _targets = [] for json_string in json_strings: json_data = json.loads(json_string) - file_name = json_data['file_name'] + img_path = json_data['file_name'] annotations = json_data['annotations'] for annotation in annotations: coordinates = annotation['box'] @@ -91,37 +88,37 @@ def __init__( dtype=np_dtype ) else: - box = _convert_xmin_ymin(coordinates) - _targets = [(_convert_xmin_ymin(annotation['box']), annotation['text'].lower(), annotation['label']) - for annotation in annotations] - if _targets: - box_targets, text_units, labels = zip(*_targets) - - self.data.append(( - file_name, - dict(boxes=np.asarray(box_targets, dtype=int), labels=list(labels), - text_units=list(text_units)), - )) + box = self._convert_xmin_ymin(coordinates) + _targets.append((annotation['text'], box)) + text_targets, box_targets = zip(*_targets) + + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + else: + self.data.append( + (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))) + ) self.root = tmp_root - -def extra_repr(self) -> str: - return f"train={self.train}" - - -def _read_from_folder(self, path: str) -> None: - for img_path in glob.glob(os.path.join(path, "*.png")): - with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: - self.data.append((img_path, f.read())) - - -@classmethod -def _convert_xmin_ymin(box: List) -> List: - if len(box) == 4: - return box - x1, y1, x2, y2, x3, y3, x4, y4 = box - x_min = min(x1, x2, x3, x4) - x_max = max(x1, x2, x3, x4) - y_min = min(y1, y2, y3, y4) - y_max = max(y1, y2, y3, y4) - return [x_min, y_min, x_max, y_max] + def extra_repr(self) -> str: + return f"train={self.train}" + + def _read_from_folder(self, path: str) -> None: + for img_path in glob.glob(os.path.join(path, "*.png")): + with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: + self.data.append((img_path, f.read())) + + @classmethod + def _convert_xmin_ymin(box: List) -> List: + if len(box) == 4: + return box + x1, y1, x2, y2, x3, y3, x4, y4 = box + x_min = min(x1, x2, x3, x4) + x_max = max(x1, x2, 
x3, x4) + y_min = min(y1, y2, y3, y4) + y_max = max(y1, y2, y3, y4) + return [x_min, y_min, x_max, y_max] From 3c7ce8d71a21fb48320bb1ffb3365d219931f07a Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 17:28:56 +0100 Subject: [PATCH 10/33] [ADD] mock_wildreceipt_dataset in conftest.py --- tests/conftest.py | 64 +++++++++++++++++++++++++++++++ tests/pytorch/test_datasets_pt.py | 2 +- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index c76e9393c7..d1091ffee9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -654,3 +654,67 @@ def mock_iiithws_dataset(tmpdir_factory, mock_image_stream): with open(fn, "wb") as f: f.write(file.getbuffer()) return str(root), str(label_file) + +@pytest.fixture(scope="session") +def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): + file = BytesIO(mock_image_stream) + root = tmpdir_factory.mktemp("datasets") + wildreceipt_root = root.mkdir("wildreceipt") + annotations_folder = wildreceipt_root + image_folder = wildreceipt_root.mkdir("image_files") + labels = { + "file_name": "image_files/Image_58/20/6aa1a16efcc7eb138bfee9ca90df704ef7498b54.jpeg", + "height": 1000, + "width": 562, + "annotations": [ + { + "box": [ + 200.0, + 16.0, + 317.0, + 16.0, + 317.0, + 0.0, + 200.0, + 0.0 + ], + "text": "", + "label": 0 + }, + { + "box": [ + 90.0, + 73.0, + 187.0, + 73.0, + 187.0, + 46.0, + 90.0, + 46.0 + ], + "text": "Server:Alex", + "label": 25 + }, + { + "box": [ + 92.0, + 95.0, + 222.0, + 95.0, + 222.0, + 67.0, + 92.0, + 67.0 + ], + "text": "Cashier:Sabrina", + "label": 25 + }] +} + + annotation_file = annotations_folder.join("train.txt") + with open(annotation_file, "w") as f: + json.dump(labels, f) + fn_i = image_folder.join(labels["file_name"]) + with open(fn_i, "wb") as f: + f.write(file.getbuffer()) + return str(image_folder), str(annotation_file) \ No newline at end of file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index cdd8811de5..6b1636de58 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -559,7 +559,7 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): [[32, 128], 5, True], ], ) -def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt): +def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): ds = datasets.WILDRECEIPT( *mock_wildreceipt, train=True, From b7d8cb7096013127215a9be7c2851546e75a2fe4 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 17:41:45 +0100 Subject: [PATCH 11/33] [BUG] mock_wildreceipt_dataset in conftest.py --- tests/conftest.py | 5 ++++- tests/pytorch/test_datasets_pt.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index d1091ffee9..852d6383c8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -662,6 +662,8 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): wildreceipt_root = root.mkdir("wildreceipt") annotations_folder = wildreceipt_root image_folder = wildreceipt_root.mkdir("image_files") + image_folder = image_folder.mkdir("Image_58") + image_folder = image_folder.mkdir("20") labels = { "file_name": "image_files/Image_58/20/6aa1a16efcc7eb138bfee9ca90df704ef7498b54.jpeg", "height": 1000, @@ -714,7 +716,8 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: 
json.dump(labels, f) - fn_i = image_folder.join(labels["file_name"]) + fn_i = root.join(labels["file_name"]) + # FIXME: this one does not create the file with open(fn_i, "wb") as f: f.write(file.getbuffer()) return str(image_folder), str(annotation_file) \ No newline at end of file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 6b1636de58..e9cd6418b5 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -561,7 +561,7 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): ds = datasets.WILDRECEIPT( - *mock_wildreceipt, + *mock_wildreceipt_dataset, train=True, img_transforms=Resize(input_size), use_polygons=rotate, From a1f09b054d17bda7ceae581266016b21a5ff477a Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 18:41:52 +0100 Subject: [PATCH 12/33] [BUG] mock_wildreceipt_dataset in conftest.py --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 852d6383c8..aad1fa3e6b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -719,5 +719,5 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): fn_i = root.join(labels["file_name"]) # FIXME: this one does not create the file with open(fn_i, "wb") as f: - f.write(file.getbuffer()) + f.write(file.read()) return str(image_folder), str(annotation_file) \ No newline at end of file From e3b9bdc09063370526dafd4af345b241d1fd0d6b Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 21:51:37 +0100 Subject: [PATCH 13/33] [BUG] mock_wildreceipt_dataset in conftest.py --- doctr/datasets/wildreceipt.py | 2 +- tests/conftest.py | 17 ++++++++++++----- tests/pytorch/test_datasets_pt.py | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 4def66aa6e..95d5b693a5 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -112,7 +112,7 @@ def _read_from_folder(self, path: str) -> None: with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: self.data.append((img_path, f.read())) - @classmethod + @staticmethod def _convert_xmin_ymin(box: List) -> List: if len(box) == 4: return box diff --git a/tests/conftest.py b/tests/conftest.py index aad1fa3e6b..efe9281f77 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import json +import os import shutil import tempfile from io import BytesIO @@ -662,10 +663,10 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): wildreceipt_root = root.mkdir("wildreceipt") annotations_folder = wildreceipt_root image_folder = wildreceipt_root.mkdir("image_files") - image_folder = image_folder.mkdir("Image_58") - image_folder = image_folder.mkdir("20") + # image_folder = image_folder.mkdir("Image_58") + # image_folder = image_folder.mkdir("20") labels = { - "file_name": "image_files/Image_58/20/6aa1a16efcc7eb138bfee9ca90df704ef7498b54.jpeg", + "file_name": "receipt_1.png", "height": 1000, "width": 562, "annotations": [ @@ -716,8 +717,14 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: json.dump(labels, f) - fn_i = root.join(labels["file_name"]) + # fn_i = root.join(labels["file_name"]) + # os.makedirs(os.path.dirname(fn_i), exist_ok=True) # 
FIXME: this one does not create the file + file = BytesIO(mock_image_stream) + fn_i = image_folder.join(f"receipt_1.png") with open(fn_i, "wb") as f: - f.write(file.read()) + f.write(file.getbuffer()) + fn_l = annotations_folder.join(f"receipt_1.json") + with open(fn_l, "w") as f: + json.dump(labels, f) return str(image_folder), str(annotation_file) \ No newline at end of file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index e9cd6418b5..c3209b4d24 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -555,8 +555,8 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): @pytest.mark.parametrize( "input_size, num_samples, recognition", [ - [[512, 512], 3, False], - [[32, 128], 5, True], + [[512, 512], 2, False], + [[32, 128], 2, True], ], ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): From 8c57b75f606f514547638b4d0cb3696d8b09030c Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 18 Oct 2023 21:52:07 +0100 Subject: [PATCH 14/33] [BUG] mock_wildreceipt_dataset in conftest.py --- tests/pytorch/test_datasets_pt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index c3209b4d24..15e8e8f841 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -573,6 +573,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ if recognition: _validate_dataset_recognition_part(ds, input_size) else: + # FIXME: you have an error here _validate_dataset(ds, input_size, is_polygons=rotate) # NOTE: following datasets are only for recognition task From 630437d907cf11d2189dfe829af3d1cdbbd85298 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 19 Oct 2023 09:58:50 +0100 Subject: [PATCH 15/33] [BUG] mock_wildreceipt_dataset in conftest.py --- tests/pytorch/test_datasets_pt.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 15e8e8f841..c457fee582 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -14,6 +14,10 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_polygons=False): # Fetch one sample img, target = ds[0] + # TODO: ADD More then one element + print(f"img, label 0 {ds[0]}") + print(f"img, label 1 {ds[1]}") + assert isinstance(img, torch.Tensor) assert img.shape == (3, *input_size) assert img.dtype == torch.float32 @@ -40,6 +44,9 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_poly pin_memory=True, collate_fn=ds.collate_fn, ) + print(f"loder is {loader}") + print(f"loder is type {type(loader)}") + print(f"next(iter(loader)) {next(iter(loader))}") images, targets = next(iter(loader)) assert isinstance(images, torch.Tensor) and images.shape == (batch_size, 3, *input_size) @@ -49,6 +56,7 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_poly def _validate_dataset_recognition_part(ds, input_size, batch_size=2): # Fetch one sample img, label = ds[0] + assert isinstance(img, torch.Tensor) assert img.shape == (3, *input_size) assert img.dtype == torch.float32 From 15804df5bb1d2407adf981454561408f06a7f75c Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Tue, 24 Oct 2023 22:31:08 +0100 Subject: [PATCH 16/33] [BUG] mock_wildreceipt_dataset in conftest.py --- doctr/datasets/wildreceipt.py | 1 + tests/conftest.py | 131 
++++++++++++++++++++-------------- 2 files changed, 79 insertions(+), 53 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 95d5b693a5..661ae4a754 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -71,6 +71,7 @@ def __init__( box: Union[List[float], np.ndarray] _targets = [] for json_string in json_strings: + # FIXME there is a bug here check it with the unit test json_data = json.loads(json_string) img_path = json_data['file_name'] annotations = json_data['annotations'] diff --git a/tests/conftest.py b/tests/conftest.py index efe9281f77..271dc64a74 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -666,65 +666,90 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): # image_folder = image_folder.mkdir("Image_58") # image_folder = image_folder.mkdir("20") labels = { - "file_name": "receipt_1.png", - "height": 1000, - "width": 562, - "annotations": [ - { - "box": [ - 200.0, - 16.0, - 317.0, - 16.0, - 317.0, - 0.0, - 200.0, - 0.0 - ], - "text": "", - "label": 0 - }, - { - "box": [ - 90.0, - 73.0, - 187.0, - 73.0, - 187.0, - 46.0, - 90.0, - 46.0 - ], - "text": "Server:Alex", - "label": 25 - }, - { - "box": [ - 92.0, - 95.0, - 222.0, - 95.0, - 222.0, - 67.0, - 92.0, - 67.0 - ], - "text": "Cashier:Sabrina", - "label": 25 - }] -} + "file_name": "receipt_0.png", + "height": 348, + "width": 348, + "annotations": [ + + { + "box": [ + 263.0, + 283.0, + 325.0, + 283.0, + 325.0, + 260.0, + 263.0, + 260.0 + ], + "text": "$55.96", + "label": 17 + }, + { + "box": [ + 274.0, + 308.0, + 326.0, + 308.0, + 326.0, + 286.0, + 274.0, + 286.0 + ], + "text": "$4.48", + "label": 19 + } + ] + } + labels2 = { + "file_name": "receipt_1.png", + "height": 1720, + "width": 856, + "annotations": [ + { + "box": [ + 511.0, + 738.0, + 527.0, + 738.0, + 527.0, + 713.0, + 511.0, + 713.0 + ], + "text": "a", + "label": 25 + }, + { + "box": [ + 386.0, + 409.0, + 599.0, + 409.0, + 599.0, + 373.0, + 386.0, + 373.0 + ], + "text": "089-46169340", + "label": 5 + } + ] + } annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: json.dump(labels, f) + json.dump(labels2, f) # fn_i = root.join(labels["file_name"]) # os.makedirs(os.path.dirname(fn_i), exist_ok=True) # FIXME: this one does not create the file file = BytesIO(mock_image_stream) - fn_i = image_folder.join(f"receipt_1.png") - with open(fn_i, "wb") as f: - f.write(file.getbuffer()) - fn_l = annotations_folder.join(f"receipt_1.json") - with open(fn_l, "w") as f: - json.dump(labels, f) + for i in range(2): + fn_i = image_folder.join(f"receipt_{i}.png") + with open(fn_i, "wb") as f: + f.write(file.getbuffer()) + # fn_l = annotations_folder.join(f"receipt_{i}.json") + # with open(fn_l, "w") as f: + # json.dump(labels, f) return str(image_folder), str(annotation_file) \ No newline at end of file From 275afa52c48aa36b2563792798236bbc7ba5378f Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 21:55:14 +0100 Subject: [PATCH 17/33] [FIX] mock_wildreceipt_dataset labels --- tests/conftest.py | 68 ++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 271dc64a74..2d52f46bcd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -666,7 +666,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): # image_folder = image_folder.mkdir("Image_58") # image_folder = image_folder.mkdir("20") labels = { - "file_name": "receipt_0.png", + 
"file_name": "receipt_0.jpeg", "height": 348, "width": 348, "annotations": [ @@ -702,40 +702,36 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): ] } labels2 = { - "file_name": "receipt_1.png", - "height": 1720, - "width": 856, - "annotations": [ - { - "box": [ - 511.0, - 738.0, - 527.0, - 738.0, - 527.0, - 713.0, - 511.0, - 713.0 - ], - "text": "a", - "label": 25 - }, - { - "box": [ - 386.0, - 409.0, - 599.0, - 409.0, - 599.0, - 373.0, - 386.0, - 373.0 - ], - "text": "089-46169340", - "label": 5 - } - ] - } + "file_name": "receipt_1.jpeg", + "height": 348, + "width": 348, + "annotations": [ + { + "box": [ + 263.0, + 283.0, + 325.0, + 283.0, + 325 + ], + "label": 25 + }, + { + "box": [ + 386.0, + 409.0, + 599.0, + 409.0, + 599.0, + 373.0, + 386.0, + 373.0 + ], + "text": "089-46169340", + "label": 5 + } + ] + } annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: @@ -746,7 +742,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): # FIXME: this one does not create the file file = BytesIO(mock_image_stream) for i in range(2): - fn_i = image_folder.join(f"receipt_{i}.png") + fn_i = image_folder.join(f"receipt_{i}.jpeg") with open(fn_i, "wb") as f: f.write(file.getbuffer()) # fn_l = annotations_folder.join(f"receipt_{i}.json") From 82ed210b63c6d534b29ae58e3ec0e4049b0352b9 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 22:19:07 +0100 Subject: [PATCH 18/33] [FIX] mock_wildreceipt_dataset labels --- tests/conftest.py | 13 ++----------- tests/pytorch/test_datasets_pt.py | 4 ++-- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 2d52f46bcd..9d422b4412 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -670,7 +670,6 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): "height": 348, "width": 348, "annotations": [ - { "box": [ 263.0, @@ -706,16 +705,6 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): "height": 348, "width": 348, "annotations": [ - { - "box": [ - 263.0, - 283.0, - 325.0, - 283.0, - 325 - ], - "label": 25 - }, { "box": [ 386.0, @@ -736,7 +725,9 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): annotation_file = annotations_folder.join("train.txt") with open(annotation_file, "w") as f: json.dump(labels, f) + f.write("\n") json.dump(labels2, f) + f.write("\n") # fn_i = root.join(labels["file_name"]) # os.makedirs(os.path.dirname(fn_i), exist_ok=True) # FIXME: this one does not create the file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index c457fee582..8de0450920 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -564,7 +564,7 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): "input_size, num_samples, recognition", [ [[512, 512], 2, False], - [[32, 128], 2, True], + [[32, 128], 1, True], ], ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): @@ -576,7 +576,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ recognition_task=recognition, ) # TODO: FINISH THIS - assert len(ds) == num_samples - 1 # -1 because of the test set 90 / 10 split + assert len(ds) == num_samples assert repr(ds) == f"WILDRECEIPT(train={True})" if recognition: _validate_dataset_recognition_part(ds, input_size) From 1e06371a5f745045fd1cd22292271f5d7a18e2cd Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 
22:34:14 +0100 Subject: [PATCH 19/33] [FIX] mock_wildreceipt_dataset labels --- doctr/datasets/wildreceipt.py | 22 +++++++++++++++++++++- tests/pytorch/test_datasets_pt.py | 3 ++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 661ae4a754..6a3becf3db 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -64,6 +64,19 @@ def __init__( np_dtype = np.float32 self.data: List[Tuple[str, Dict[str, Any]]] = [] + + # define folder to write IMGUR5K recognition dataset + reco_folder_name = "WILDRECEIPT_recognition_train" if self.train else "WILDRECEIPT_recognition_test" + reco_folder_name = "Poly_" + reco_folder_name if use_polygons else reco_folder_name + reco_folder_path = os.path.join(os.path.dirname(self.root), reco_folder_name) + reco_images_counter = 0 + + if recognition_task and os.path.isdir(reco_folder_path): + self._read_from_folder(reco_folder_path) + return + elif recognition_task and not os.path.isdir(reco_folder_path): + os.makedirs(reco_folder_path, exist_ok=False) + with open(label_path, 'r') as file: data = file.read() # Split the text file into separate JSON strings @@ -98,11 +111,18 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - self.data.append((crop, label)) + with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f: + f.write(label) + tmp_img = Image.fromarray(crop) + tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png")) + reco_images_counter += 1 + # self.data.append((crop, label)) else: self.data.append( (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))) ) + if recognition_task: + self._read_from_folder(reco_folder_path) self.root = tmp_root def extra_repr(self) -> str: diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 8de0450920..80c5159680 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -564,7 +564,7 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): "input_size, num_samples, recognition", [ [[512, 512], 2, False], - [[32, 128], 1, True], + [[32, 128], 5, True], ], ) def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): @@ -576,6 +576,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ recognition_task=recognition, ) # TODO: FINISH THIS + print(f"recognition {recognition}") assert len(ds) == num_samples assert repr(ds) == f"WILDRECEIPT(train={True})" if recognition: From a968db487bfe3c0b4dfec6db3cac82192d9d5bc6 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 22:35:35 +0100 Subject: [PATCH 20/33] remove todos --- tests/conftest.py | 9 +-------- tests/pytorch/test_datasets_pt.py | 9 --------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9d422b4412..56fc369c1e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -663,8 +663,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): wildreceipt_root = root.mkdir("wildreceipt") annotations_folder = wildreceipt_root image_folder = wildreceipt_root.mkdir("image_files") - # image_folder = image_folder.mkdir("Image_58") - # image_folder = image_folder.mkdir("20") + labels = { "file_name": "receipt_0.jpeg", "height": 348, @@ -728,15 
+727,9 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): f.write("\n") json.dump(labels2, f) f.write("\n") - # fn_i = root.join(labels["file_name"]) - # os.makedirs(os.path.dirname(fn_i), exist_ok=True) - # FIXME: this one does not create the file file = BytesIO(mock_image_stream) for i in range(2): fn_i = image_folder.join(f"receipt_{i}.jpeg") with open(fn_i, "wb") as f: f.write(file.getbuffer()) - # fn_l = annotations_folder.join(f"receipt_{i}.json") - # with open(fn_l, "w") as f: - # json.dump(labels, f) return str(image_folder), str(annotation_file) \ No newline at end of file diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py index 80c5159680..2168ea0e3f 100644 --- a/tests/pytorch/test_datasets_pt.py +++ b/tests/pytorch/test_datasets_pt.py @@ -14,9 +14,6 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_polygons=False): # Fetch one sample img, target = ds[0] - # TODO: ADD More then one element - print(f"img, label 0 {ds[0]}") - print(f"img, label 1 {ds[1]}") assert isinstance(img, torch.Tensor) assert img.shape == (3, *input_size) @@ -44,9 +41,6 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_poly pin_memory=True, collate_fn=ds.collate_fn, ) - print(f"loder is {loader}") - print(f"loder is type {type(loader)}") - print(f"next(iter(loader)) {next(iter(loader))}") images, targets = next(iter(loader)) assert isinstance(images, torch.Tensor) and images.shape == (batch_size, 3, *input_size) @@ -575,14 +569,11 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_ use_polygons=rotate, recognition_task=recognition, ) - # TODO: FINISH THIS - print(f"recognition {recognition}") assert len(ds) == num_samples assert repr(ds) == f"WILDRECEIPT(train={True})" if recognition: _validate_dataset_recognition_part(ds, input_size) else: - # FIXME: you have an error here _validate_dataset(ds, input_size, is_polygons=rotate) # NOTE: following datasets are only for recognition task From e42c71e6d3fb6b4427b180fd016a267248061957 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Wed, 25 Oct 2023 22:36:40 +0100 Subject: [PATCH 21/33] remove todos --- doctr/datasets/wildreceipt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 6a3becf3db..7bb3c8de3e 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -84,7 +84,6 @@ def __init__( box: Union[List[float], np.ndarray] _targets = [] for json_string in json_strings: - # FIXME there is a bug here check it with the unit test json_data = json.loads(json_string) img_path = json_data['file_name'] annotations = json_data['annotations'] From 4ec3bf52cc8029663f5f0d765231ad8160577055 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 10:24:21 +0100 Subject: [PATCH 22/33] [UPDATE] wildreceipt_image_folder --- tests/conftest.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 56fc369c1e..9439b07282 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -665,7 +665,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): image_folder = wildreceipt_root.mkdir("image_files") labels = { - "file_name": "receipt_0.jpeg", + "file_name": "Image_58/20/receipt_0.jpeg", "height": 348, "width": 348, "annotations": [ @@ -700,7 +700,7 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): ] } labels2 = { - "file_name": "receipt_1.jpeg", + 
"file_name": "Image_58/20/receipt_1.jpeg", "height": 348, "width": 348, "annotations": [ @@ -728,8 +728,10 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream): json.dump(labels2, f) f.write("\n") file = BytesIO(mock_image_stream) + wildreceipt_image_folder = image_folder.mkdir("Image_58") + wildreceipt_image_folder = wildreceipt_image_folder.mkdir("20") for i in range(2): - fn_i = image_folder.join(f"receipt_{i}.jpeg") + fn_i = wildreceipt_image_folder.join(f"receipt_{i}.jpeg") with open(fn_i, "wb") as f: f.write(file.getbuffer()) return str(image_folder), str(annotation_file) \ No newline at end of file From ff4b3998003dda4d19bd75181aea097165ecae08 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 10:43:03 +0100 Subject: [PATCH 23/33] [ADD] test_wildreceipt_dataset tf --- tests/tensorflow/test_datasets_tf.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/tensorflow/test_datasets_tf.py b/tests/tensorflow/test_datasets_tf.py index 6a89c89263..ac426e4bd0 100644 --- a/tests/tensorflow/test_datasets_tf.py +++ b/tests/tensorflow/test_datasets_tf.py @@ -535,6 +535,29 @@ def test_ic03(input_size, num_samples, rotate, recognition, mock_ic03_dataset): _validate_dataset(ds, input_size, is_polygons=rotate) +@pytest.mark.parametrize("rotate", [True, False]) +@pytest.mark.parametrize( + "input_size, num_samples, recognition", + [ + [[512, 512], 2, False], + [[32, 128], 5, True], + ], +) +def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_wildreceipt_dataset): + ds = datasets.WILDRECEIPT( + *mock_wildreceipt_dataset, + train=True, + img_transforms=Resize(input_size), + use_polygons=rotate, + recognition_task=recognition, + ) + assert len(ds) == num_samples + assert repr(ds) == f"WILDRECEIPT(train={True})" + if recognition: + _validate_dataset_recognition_part(ds, input_size) + else: + _validate_dataset(ds, input_size, is_polygons=rotate) + # NOTE: following datasets are only for recognition task From 2a7d1e00db91ba39e9e0b20baaebdea15e64305f Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 10:43:37 +0100 Subject: [PATCH 24/33] [UPDATE] WILDRECEIPT optimize imports --- doctr/datasets/wildreceipt.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 7bb3c8de3e..85359a5e8a 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -6,13 +6,10 @@ import glob import json import os -from pathlib import Path from typing import Any, Dict, List, Tuple, Union -import cv2 import numpy as np from PIL import Image -from tqdm import tqdm from .datasets import AbstractDataset from .utils import convert_target_to_relative, crop_bboxes_from_image From bffca24aad0eccdfa7eadad9dd45ec63a54d346c Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 10:55:07 +0100 Subject: [PATCH 25/33] [FIX] WILDRECEIPT self.data --- doctr/datasets/wildreceipt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 85359a5e8a..5fdded8adf 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -6,6 +6,7 @@ import glob import json import os +from pathlib import Path from typing import Any, Dict, List, Tuple, Union import numpy as np @@ -59,8 +60,7 @@ def __init__( tmp_root = img_folder self.train = train np_dtype = np.float32 - self.data: List[Tuple[str, Dict[str, Any]]] = [] - + self.data: List[Tuple[Union[str, Path, 
np.ndarray], Union[str, Dict[str, Any]]]] = [] # define folder to write IMGUR5K recognition dataset reco_folder_name = "WILDRECEIPT_recognition_train" if self.train else "WILDRECEIPT_recognition_test" From 954b8b02126a968cffccd62dfd872cdbcab257b0 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 14:50:32 +0100 Subject: [PATCH 26/33] [UPDATE] save fata in RAM --- doctr/datasets/wildreceipt.py | 48 +++++++---------------------------- 1 file changed, 9 insertions(+), 39 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 5fdded8adf..96bd8e8658 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -17,13 +17,15 @@ __all__ = ["WILDRECEIPT"] +from ..utils import polygon_to_bbox + class WILDRECEIPT(AbstractDataset): - """WildReceipt is a collection of receipts. It contains, for each photo, of a list of OCRs - with bounding box, text, and class." - `_ | + """WildReceipt dataset from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" + `_ | `repository `_. - >>> # NOTE: You need to download/generate the dataset from the repository. + >>> # NOTE: You need to download the dataset from the repository. >>> from doctr.datasets import WILDRECEIPT >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/", >>> label_path="/path/to/wildreceipt/train.txt") @@ -62,17 +64,6 @@ def __init__( np_dtype = np.float32 self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = [] - # define folder to write IMGUR5K recognition dataset - reco_folder_name = "WILDRECEIPT_recognition_train" if self.train else "WILDRECEIPT_recognition_test" - reco_folder_name = "Poly_" + reco_folder_name if use_polygons else reco_folder_name - reco_folder_path = os.path.join(os.path.dirname(self.root), reco_folder_name) - reco_images_counter = 0 - - if recognition_task and os.path.isdir(reco_folder_path): - self._read_from_folder(reco_folder_path) - return - elif recognition_task and not os.path.isdir(reco_folder_path): - os.makedirs(reco_folder_path, exist_ok=False) with open(label_path, 'r') as file: data = file.read() @@ -98,7 +89,9 @@ def __init__( dtype=np_dtype ) else: - box = self._convert_xmin_ymin(coordinates) + box_targets = polygon_to_bbox( + tuple((coordinates[i], coordinates[i + 1]) for i in range(0, len(coordinates), 2))) + box = [coord for coords in box_targets for coord in coords] _targets.append((annotation['text'], box)) text_targets, box_targets = zip(*_targets) @@ -107,35 +100,12 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f: - f.write(label) - tmp_img = Image.fromarray(crop) - tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png")) - reco_images_counter += 1 - # self.data.append((crop, label)) + self.data.append((crop, label)) else: self.data.append( (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))) ) - if recognition_task: - self._read_from_folder(reco_folder_path) self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}" - - def _read_from_folder(self, path: str) -> None: - for img_path in glob.glob(os.path.join(path, "*.png")): - with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: - self.data.append((img_path, f.read())) - - @staticmethod - def 
_convert_xmin_ymin(box: List) -> List: - if len(box) == 4: - return box - x1, y1, x2, y2, x3, y3, x4, y4 = box - x_min = min(x1, x2, x3, x4) - x_max = max(x1, x2, x3, x4) - y_min = min(y1, y2, y3, y4) - y_max = max(y1, y2, y3, y4) - return [x_min, y_min, x_max, y_max] From edbcaf2ad134c2e026d1212808a47f0931227ef0 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 15:02:15 +0100 Subject: [PATCH 27/33] [UPDATE] docs --- docs/source/index.rst | 1 + docs/source/using_doctr/using_datasets.rst | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index bad3a46420..ab3e7fb4b8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,6 +66,7 @@ Supported datasets * IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. * MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. * IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. toctree:: diff --git a/docs/source/using_doctr/using_datasets.rst b/docs/source/using_doctr/using_datasets.rst index 35146e576f..67fa8a6325 100644 --- a/docs/source/using_doctr/using_datasets.rst +++ b/docs/source/using_doctr/using_datasets.rst @@ -41,6 +41,8 @@ This datasets contains the information to train or validate a text detection mod +-----------------------------+---------------------------------+---------------------------------+----------------------------------+ | IMGUR5K | 7149 | 796 | Handwritten / external resources | +-----------------------------+---------------------------------+---------------------------------+----------------------------------+ +| WILDRECEIPT | 1268 | 472 | external resources | ++-----------------------------+---------------------------------+---------------------------------+----------------------------------+ .. code:: python3 @@ -84,6 +86,8 @@ This datasets contains the information to train or validate a text recognition m +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ | IIITHWS | 7141797 | 793533 | english / handwritten / external resources | +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ +| WILDRECEIPT | 1268 | 472 | english / external resources | ++-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ .. 
code:: python3 From e257a29b0c25deb0c37245c4459969c6258eea32 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Thu, 26 Oct 2023 18:42:24 +0100 Subject: [PATCH 28/33] [UPDATE] box wildreceipt --- doctr/datasets/wildreceipt.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 96bd8e8658..90815c649c 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -89,9 +89,8 @@ def __init__( dtype=np_dtype ) else: - box_targets = polygon_to_bbox( - tuple((coordinates[i], coordinates[i + 1]) for i in range(0, len(coordinates), 2))) - box = [coord for coords in box_targets for coord in coords] + x, y = coordinates[::2], coordinates[1::2] + box = [min(x), min(y), max(x), max(y)] _targets.append((annotation['text'], box)) text_targets, box_targets = zip(*_targets) From 2b3a5780e28ac0f84334fca8ac8b73d5695cafa0 Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Fri, 27 Oct 2023 11:08:52 +0100 Subject: [PATCH 29/33] [UPDATE] docs --- docs/source/using_doctr/using_datasets.rst | 2 +- doctr/datasets/wildreceipt.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/using_doctr/using_datasets.rst b/docs/source/using_doctr/using_datasets.rst index 67fa8a6325..52c5f7e24d 100644 --- a/docs/source/using_doctr/using_datasets.rst +++ b/docs/source/using_doctr/using_datasets.rst @@ -86,7 +86,7 @@ This datasets contains the information to train or validate a text recognition m +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ | IIITHWS | 7141797 | 793533 | english / handwritten / external resources | +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ -| WILDRECEIPT | 1268 | 472 | english / external resources | +| WILDRECEIPT | 49377 | 19598 | english / external resources | +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ .. code:: python3 diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 90815c649c..107c211b6e 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -25,7 +25,7 @@ class WILDRECEIPT(AbstractDataset): `_ | `repository `_. - >>> # NOTE: You need to download the dataset from the repository. + >>> # NOTE: You need to download the dataset first. 
>>> from doctr.datasets import WILDRECEIPT >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/", >>> label_path="/path/to/wildreceipt/train.txt") @@ -99,7 +99,8 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - self.data.append((crop, label)) + if not any(char in label for char in ["", "-", "*", "/", "=", "#", "@"]): + self.data.append((crop, label)) else: self.data.append( (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))) From c18175bde41378c8c3a31d17b718a3fa3b307c1d Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Fri, 27 Oct 2023 11:20:14 +0100 Subject: [PATCH 30/33] [UPDATE] filter empty and whitespace --- doctr/datasets/wildreceipt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 107c211b6e..5fa5464b8b 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -99,7 +99,7 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - if not any(char in label for char in ["", "-", "*", "/", "=", "#", "@"]): + if not any(char in label for char in ["", " "]): self.data.append((crop, label)) else: self.data.append( From 6c3379953495de3d974f1c606aa3c6b1eb5db0fd Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Fri, 27 Oct 2023 12:03:42 +0100 Subject: [PATCH 31/33] [UPDATE] filter empty and whitespace --- doctr/datasets/wildreceipt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 5fa5464b8b..9330e68a7c 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -99,7 +99,7 @@ def __init__( img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) ) for crop, label in zip(crops, list(text_targets)): - if not any(char in label for char in ["", " "]): + if label and " " not in label: self.data.append((crop, label)) else: self.data.append( From fcedaba939407b35456dbe7b713c7d47320be91b Mon Sep 17 00:00:00 2001 From: HamzaGbada Date: Fri, 27 Oct 2023 12:08:19 +0100 Subject: [PATCH 32/33] [FIX] format --- doctr/datasets/wildreceipt.py | 32 ++++++------- tests/conftest.py | 69 ++++++---------------------- tests/pytorch/test_datasets_pt.py | 3 +- tests/tensorflow/test_datasets_tf.py | 1 + 4 files changed, 32 insertions(+), 73 deletions(-) diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py index 9330e68a7c..de394595fa 100644 --- a/doctr/datasets/wildreceipt.py +++ b/doctr/datasets/wildreceipt.py @@ -3,21 +3,18 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. 
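Editorial aside, not part of the patch: the recognition-mode path that the preceding patches converge on cuts word crops out of the page image with docTR's crop_bboxes_from_image helper (the same call used in the loader above) and keeps a crop only when its transcription is non-empty and contains no space. A rough sketch under those assumptions; recognition_samples is an illustrative helper, and the absolute import path follows from the relative import "from .utils import crop_bboxes_from_image" shown above.

import os

import numpy as np

from doctr.datasets.utils import crop_bboxes_from_image


def recognition_samples(img_folder, rel_img_path, targets):
    # targets: (text, [xmin, ymin, xmax, ymax]) pairs parsed from one JSON line
    texts, boxes = zip(*targets)
    crops = crop_bboxes_from_image(
        img_path=os.path.join(img_folder, rel_img_path),
        geoms=np.asarray(boxes, dtype=int).clip(min=0),
    )
    # same filtering rule as the patch: drop empty labels and labels containing a space
    return [(crop, label) for crop, label in zip(crops, texts) if label and " " not in label]
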
-import glob
 import json
 import os
 from pathlib import Path
 from typing import Any, Dict, List, Tuple, Union

 import numpy as np
-from PIL import Image

 from .datasets import AbstractDataset
 from .utils import convert_target_to_relative, crop_bboxes_from_image

 __all__ = ["WILDRECEIPT"]

-from ..utils import polygon_to_bbox


 class WILDRECEIPT(AbstractDataset):
@@ -44,13 +41,13 @@ class WILDRECEIPT(AbstractDataset):
     """
     def __init__(
-            self,
-            img_folder: str,
-            label_path: str,
-            train: bool = True,
-            use_polygons: bool = False,
-            recognition_task: bool = False,
-            **kwargs: Any,
+        self,
+        img_folder: str,
+        label_path: str,
+        train: bool = True,
+        use_polygons: bool = False,
+        recognition_task: bool = False,
+        **kwargs: Any,
     ) -> None:
         super().__init__(
             img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
         )
@@ -64,19 +61,18 @@ def __init__(
         np_dtype = np.float32
         self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
-
-        with open(label_path, 'r') as file:
+        with open(label_path, "r") as file:
             data = file.read()
         # Split the text file into separate JSON strings
-        json_strings = data.strip().split('\n')
+        json_strings = data.strip().split("\n")
         box: Union[List[float], np.ndarray]
         _targets = []
         for json_string in json_strings:
             json_data = json.loads(json_string)
-            img_path = json_data['file_name']
-            annotations = json_data['annotations']
+            img_path = json_data["file_name"]
+            annotations = json_data["annotations"]
             for annotation in annotations:
-                coordinates = annotation['box']
+                coordinates = annotation["box"]
                 if use_polygons:
                     # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                     box = np.array(
                         [
                             [coordinates[0], coordinates[1]],
                             [coordinates[2], coordinates[3]],
                             [coordinates[4], coordinates[5]],
                             [coordinates[6], coordinates[7]],
                         ],
-                        dtype=np_dtype
+                        dtype=np_dtype,
                     )
                 else:
                     x, y = coordinates[::2], coordinates[1::2]
                     box = [min(x), min(y), max(x), max(y)]
-                _targets.append((annotation['text'], box))
+                _targets.append((annotation["text"], box))

         text_targets, box_targets = zip(*_targets)
         if recognition_task:
diff --git a/tests/conftest.py b/tests/conftest.py
index 9439b07282..4a18e9bb28 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,4 @@
 import json
-import os
 import shutil
 import tempfile
 from io import BytesIO
@@ -656,6 +655,7 @@ def mock_iiithws_dataset(tmpdir_factory, mock_image_stream):
         f.write(file.getbuffer())
     return str(root), str(label_file)

+
 @pytest.fixture(scope="session")
 def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream):
     file = BytesIO(mock_image_stream)
@@ -665,61 +665,22 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream):
     image_folder = wildreceipt_root.mkdir("image_files")

     labels = {
-            "file_name": "Image_58/20/receipt_0.jpeg",
-            "height": 348,
-            "width": 348,
-            "annotations": [
-                {
-                    "box": [
-                        263.0,
-                        283.0,
-                        325.0,
-                        283.0,
-                        325.0,
-                        260.0,
-                        263.0,
-                        260.0
-                    ],
-                    "text": "$55.96",
-                    "label": 17
-                },
-                {
-                    "box": [
-                        274.0,
-                        308.0,
-                        326.0,
-                        308.0,
-                        326.0,
-                        286.0,
-                        274.0,
-                        286.0
-                    ],
-                    "text": "$4.48",
-                    "label": 19
-                }
-            ]
+        "file_name": "Image_58/20/receipt_0.jpeg",
+        "height": 348,
+        "width": 348,
+        "annotations": [
+            {"box": [263.0, 283.0, 325.0, 283.0, 325.0, 260.0, 263.0, 260.0], "text": "$55.96", "label": 17},
+            {"box": [274.0, 308.0, 326.0, 308.0, 326.0, 286.0, 274.0, 286.0], "text": "$4.48", "label": 19},
+        ],
     }

     labels2 = {
-            "file_name": "Image_58/20/receipt_1.jpeg",
-            "height": 348,
-            "width": 348,
-            "annotations": [
-                {
-                    "box": [
-                        386.0,
-                        409.0,
-                        599.0,
-                        409.0,
-                        599.0,
-                        373.0,
-                        386.0,
-                        373.0
+        "file_name": "Image_58/20/receipt_1.jpeg",
+        "height": 348,
+        "width": 348,
+        "annotations": [
+            {"box": [386.0, 409.0, 599.0, 409.0, 599.0, 373.0, 386.0, 373.0], "text": "089-46169340", "label": 5}
         ],
-                    "text": "089-46169340",
-                    "label": 5
-                }
-            ]
-        }
+    }

     annotation_file = annotations_folder.join("train.txt")
     with open(annotation_file, "w") as f:
@@ -734,4 +695,4 @@ def mock_wildreceipt_dataset(tmpdir_factory, mock_image_stream):
         fn_i = wildreceipt_image_folder.join(f"receipt_{i}.jpeg")
         with open(fn_i, "wb") as f:
             f.write(file.getbuffer())
-    return str(image_folder), str(annotation_file)
\ No newline at end of file
+    return str(image_folder), str(annotation_file)
diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py
index 2168ea0e3f..23d2a69fd1 100644
--- a/tests/pytorch/test_datasets_pt.py
+++ b/tests/pytorch/test_datasets_pt.py
@@ -576,6 +576,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_
     else:
         _validate_dataset(ds, input_size, is_polygons=rotate)

+
 # NOTE: following datasets are only for recognition task

@@ -600,4 +601,4 @@ def test_iiithws_dataset(mock_iiithws_dataset):
     assert len(ds) == 4  # Actual set has 7141797 train and 793533 test samples
     assert repr(ds) == f"IIITHWS(train={True})"
-    _validate_dataset_recognition_part(ds, input_size)
\ No newline at end of file
+    _validate_dataset_recognition_part(ds, input_size)
diff --git a/tests/tensorflow/test_datasets_tf.py b/tests/tensorflow/test_datasets_tf.py
index ac426e4bd0..e8121b9d38 100644
--- a/tests/tensorflow/test_datasets_tf.py
+++ b/tests/tensorflow/test_datasets_tf.py
@@ -558,6 +558,7 @@ def test_wildreceipt_dataset(input_size, num_samples, rotate, recognition, mock_
     else:
         _validate_dataset(ds, input_size, is_polygons=rotate)

+
 # NOTE: following datasets are only for recognition task

From 478a420561d41e2cf6aab386126d9da329eb03ab Mon Sep 17 00:00:00 2001
From: HamzaGbada
Date: Fri, 27 Oct 2023 13:36:41 +0100
Subject: [PATCH 33/33] [FIX] format

---
 doctr/datasets/wildreceipt.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doctr/datasets/wildreceipt.py b/doctr/datasets/wildreceipt.py
index de394595fa..a802290110 100644
--- a/doctr/datasets/wildreceipt.py
+++ b/doctr/datasets/wildreceipt.py
@@ -16,7 +16,6 @@
 __all__ = ["WILDRECEIPT"]


-
 class WILDRECEIPT(AbstractDataset):
     """WildReceipt dataset from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction"
     `_ |