diff --git a/.buildinfo b/.buildinfo
index bd8b9a11ae..2e9e43e824 100644
--- a/.buildinfo
+++ b/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
-# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: be0cf43efa4357c625c34906153a2192
+# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 7d20f70c4164b534d00ead21f34aefbc
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle
index eb8cdbd0b8..8493876491 100644
Binary files a/.doctrees/environment.pickle and b/.doctrees/environment.pickle differ

[diff hunks for the rebuilt Sphinx HTML pages under _modules/: the doctr.datasets, doctr.io, doctr.models, doctr.transforms and doctr.utils source pages, plus the module index]
- + diff --git a/_static/basic.css b/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/_static/basic.css +++ b/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/_static/doctools.js b/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/_static/doctools.js +++ b/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/_static/language_data.js b/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/_static/language_data.js +++ b/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/_static/searchtools.js b/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/_static/searchtools.js +++ b/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. 
+class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/changelog.html b/changelog.html index 1d18911bb0..a716261488 100644 --- a/changelog.html +++ b/changelog.html @@ -14,7 +14,7 @@ - + Changelog - docTR documentation @@ -437,7 +437,7 @@
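The _static/searchtools.js hunk above threads a result kind through the whole search pipeline: results are now [docname, title, anchor, descr, score, filename, kind] tuples, the new SearchResultKind helper enumerates the four possible kinds (index, object, text, title), and _displayItem tags each result <li> with a matching kind-<kind> class so themes can style result types differently. The sketch below shows one way a theme-side script could consume that class; it is an illustrative assumption rather than part of Sphinx, and the label map and the search-kind-badge class name are invented for the example.

```javascript
// Hypothetical theme hook: label each search result with its kind, using the
// `kind-*` class that the patched _displayItem adds to every result <li>.
const KIND_LABELS = { index: "Index", object: "API", text: "Page", title: "Title" };

const labelResult = (li) => {
  const kindClass = [...li.classList].find((cls) => cls.startsWith("kind-"));
  const label = kindClass && KIND_LABELS[kindClass.slice("kind-".length)];
  if (!label || li.querySelector(".search-kind-badge")) return; // unknown kind or already labelled
  const badge = document.createElement("span");
  badge.className = "search-kind-badge";
  badge.textContent = label;
  li.prepend(badge);
};

// searchtools.js inserts results asynchronously, so watch the results container
// instead of running once at page load.
const container = document.getElementById("search-results");
if (container) {
  new MutationObserver(() => {
    container.querySelectorAll("ul.search li").forEach(labelResult);
  }).observe(container, { childList: true, subtree: true });
}
```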

[diff hunks for the rebuilt top-level HTML pages (changelog, contributing pages, genindex, installation, index) and for the mirrored source pages under latest/_modules/]
- + diff --git a/latest/_sources/getting_started/installing.rst.txt b/latest/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/latest/_sources/getting_started/installing.rst.txt +++ b/latest/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/latest/_static/basic.css b/latest/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/latest/_static/basic.css +++ b/latest/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/latest/_static/doctools.js b/latest/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/latest/_static/doctools.js +++ b/latest/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/latest/_static/language_data.js b/latest/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/latest/_static/language_data.js +++ b/latest/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/latest/_static/searchtools.js b/latest/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/latest/_static/searchtools.js +++ b/latest/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. 
+class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/latest/changelog.html b/latest/changelog.html index ac81a6f231..fc45a50384 100644 --- a/latest/changelog.html +++ b/latest/changelog.html @@ -14,7 +14,7 @@ - + Changelog - docTR documentation @@ -446,7 +446,7 @@
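The latest/_static/searchtools.js hunk above also swaps the single "found ${resultCount} page(s)" status string for a Documentation.ngettext call, so translators get separate singular and plural messages instead of a "(s)" suffix. A minimal, self-contained sketch of that selection logic, assuming only a gettext-style helper with the same (singular, plural, n) shape as Documentation.ngettext:

```javascript
// Stand-in for Documentation.ngettext: pick the singular or plural template.
const ngettext = (singular, plural, n) => (n === 1 ? singular : plural);

const searchStatus = (resultCount) =>
  ngettext(
    "Search finished, found one page matching the search query.",
    "Search finished, found ${resultCount} pages matching the search query.",
    resultCount,
  ).replace("${resultCount}", resultCount);

console.log(searchStatus(1)); // Search finished, found one page matching the search query.
console.log(searchStatus(7)); // Search finished, found 7 pages matching the search query.
```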

[diff hunks for the rebuilt HTML pages under latest/: changelog, community resources, code of conduct, contributing, genindex]
diff --git a/latest/getting_started/installing.html b/latest/getting_started/installing.html
index a488e9a030..af3b58193e 100644
--- a/latest/getting_started/installing.html
+++ b/latest/getting_started/installing.html
@@ -305,7 +305,7 @@

Installation

-This library requires Python 3.9 or higher.
+This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

[remaining hunk of latest/getting_started/installing.html and diff hunks for the rebuilt latest/ pages: index, modules/contrib, modules/datasets, modules/io, modules/models, modules/transforms, modules/utils, notebooks]
docTR Notebooks - + diff --git a/latest/search.html b/latest/search.html index 82b8bd6950..d050f5eac7 100644 --- a/latest/search.html +++ b/latest/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -340,7 +340,7 @@ - + diff --git a/latest/searchindex.js b/latest/searchindex.js index bfa546d0e9..6f154115ab 100644 --- a/latest/searchindex.js +++ b/latest/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, "codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], "Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, 
"end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], "Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, 
"what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module 
doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, 
"doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, 
"doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", 
false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", "Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], 
[10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], "51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 
19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [4, 10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, "b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 
19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], "db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 
10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 
18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], "json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": 
[2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, 
"ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 
10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": 
[3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], 
"word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, "coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, 
"guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, 
"codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], "Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, "end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], 
"Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", 
"using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in 
doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, 
"doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, 
"doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", 
"crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", "Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 
17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], "51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 
11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, "b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 
19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], "db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], 
"faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": 
[15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], "json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": 
[7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], 
"problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], 
"skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, 
"view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, 
"\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 
8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, "coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/latest/using_doctr/custom_models_training.html b/latest/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/latest/using_doctr/custom_models_training.html +++ b/latest/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/latest/using_doctr/running_on_aws.html b/latest/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/latest/using_doctr/running_on_aws.html +++ b/latest/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/latest/using_doctr/sharing_models.html b/latest/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/latest/using_doctr/sharing_models.html +++ b/latest/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/latest/using_doctr/using_contrib_modules.html b/latest/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/latest/using_doctr/using_contrib_modules.html +++ b/latest/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/latest/using_doctr/using_datasets.html b/latest/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/latest/using_doctr/using_datasets.html +++ b/latest/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/latest/using_doctr/using_model_export.html b/latest/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/latest/using_doctr/using_model_export.html +++ b/latest/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/latest/using_doctr/using_models.html b/latest/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/latest/using_doctr/using_models.html +++ b/latest/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/modules/contrib.html b/modules/contrib.html index 22b0c508a6..b8878635b6 100644 --- a/modules/contrib.html +++ b/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -376,7 +376,7 @@

Supported contribution modules - + diff --git a/modules/datasets.html b/modules/datasets.html index 0fe4b78d48..dfcacbc96e 100644 --- a/modules/datasets.html +++ b/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -1077,7 +1077,7 @@

Returns: - + diff --git a/modules/io.html b/modules/io.html index 924d292c59..77e9e017bf 100644 --- a/modules/io.html +++ b/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -756,7 +756,7 @@

Returns: - + diff --git a/modules/models.html b/modules/models.html index bf45d11a71..f4a9833365 100644 --- a/modules/models.html +++ b/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1598,7 +1598,7 @@

Args: - + diff --git a/modules/transforms.html b/modules/transforms.html index 6d77d16e7b..bc254c867b 100644 --- a/modules/transforms.html +++ b/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -831,7 +831,7 @@

Args: - + diff --git a/modules/utils.html b/modules/utils.html index 3dd3ecbd96..6784d81f6f 100644 --- a/modules/utils.html +++ b/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -711,7 +711,7 @@

Args: - + diff --git a/notebooks.html b/notebooks.html index f3ea994e49..647f73d4eb 100644 --- a/notebooks.html +++ b/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -387,7 +387,7 @@

docTR Notebooks - + diff --git a/search.html b/search.html index f0693e2c97..0e0da5efb3 100644 --- a/search.html +++ b/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -336,7 +336,7 @@ - + diff --git a/searchindex.js b/searchindex.js index 8598997441..df18967072 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[13, null]], "Advanced options": [[18, "advanced-options"]], "Args:": [[6, "args"], [6, "id4"], [6, "id7"], [6, "id10"], [6, "id13"], [6, "id16"], [6, "id19"], [6, "id22"], [6, "id25"], [6, "id29"], [6, "id32"], [6, "id37"], [6, "id40"], [6, "id46"], [6, "id49"], [6, "id50"], [6, "id51"], [6, "id54"], [6, "id57"], [6, "id60"], [6, "id61"], [7, "args"], [7, "id2"], [7, "id3"], [7, "id4"], [7, "id5"], [7, "id6"], [7, "id7"], [7, "id10"], [7, "id12"], [7, "id14"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id28"], [8, "args"], [8, "id3"], [8, "id8"], [8, "id13"], [8, "id17"], [8, "id21"], [8, "id26"], [8, "id31"], [8, "id36"], [8, "id41"], [8, "id46"], [8, "id50"], [8, "id54"], [8, "id59"], [8, "id63"], [8, "id68"], [8, "id73"], [8, "id77"], [8, "id81"], [8, "id85"], [8, "id90"], [8, "id95"], [8, "id99"], [8, "id104"], [8, "id109"], [8, "id114"], [8, "id119"], [8, "id123"], [8, "id127"], [8, "id132"], [8, "id137"], [8, "id142"], [8, "id146"], [8, "id150"], [8, "id155"], [8, "id159"], [8, "id163"], [8, "id167"], [8, "id169"], [8, "id171"], [8, "id173"], [9, "args"], [9, "id1"], [9, "id2"], [9, "id3"], [9, "id4"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"], [9, "id12"], [9, "id13"], [9, "id14"], [9, "id15"], [9, "id16"], [9, "id17"], [9, "id18"], [9, "id19"], [10, "args"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"]], "Artefact": [[7, "artefact"]], "ArtefactDetection": [[15, "artefactdetection"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[16, "available-datasets"]], "Available architectures": [[18, "available-architectures"], [18, "id1"], [18, "id2"]], "Available contribution modules": [[15, "available-contribution-modules"]], "Block": [[7, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[16, null]], "Choosing the right model": [[18, null]], "Classification": [[14, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[9, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[6, "custom-dataset-loader"]], "Custom orientation classification models": [[12, "custom-orientation-classification-models"]], "Data Loading": [[16, "data-loading"]], "Dataloader": [[6, "dataloader"]], "Detection": [[14, "detection"], [16, "detection"]], "Detection predictors": [[18, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[7, "document"]], "Document structure": [[7, "document-structure"]], "End-to-End OCR": [[18, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, 
"enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[17, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[7, "file-reading"]], "Half-precision": [[17, "half-precision"]], "Installation": [[3, null]], "Integrate contributions into your pipeline": [[15, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[7, "line"]], "Loading from Huggingface Hub": [[14, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[12, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[12, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[17, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[14, "naming-conventions"]], "OCR": [[16, "ocr"]], "Object Detection": [[16, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[7, "page"]], "Preparing your model for inference": [[17, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[14, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[14, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[14, "recognition"], [16, "recognition"]], "Recognition predictors": [[18, "recognition-predictors"]], "Returns:": [[6, "returns"], [7, "returns"], [7, "id11"], [7, "id13"], [7, "id15"], [7, "id19"], [7, "id23"], [7, "id27"], [7, "id31"], [8, "returns"], [8, "id6"], [8, "id11"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id29"], [8, "id34"], [8, "id39"], [8, "id44"], [8, "id49"], [8, "id53"], [8, "id57"], [8, "id62"], [8, "id66"], [8, "id71"], [8, "id76"], [8, "id80"], [8, "id84"], [8, "id88"], [8, "id93"], [8, "id98"], [8, "id102"], [8, "id107"], [8, "id112"], [8, "id117"], [8, "id122"], [8, "id126"], [8, "id130"], [8, "id135"], [8, "id140"], [8, "id145"], [8, "id149"], [8, "id153"], [8, "id158"], [8, "id162"], [8, "id166"], [8, "id168"], [8, "id170"], [8, "id172"], [10, "returns"]], "Scope": [[1, "scope"]], "Share your model with the community": [[14, null]], "Supported Vocabs": [[6, "supported-vocabs"]], "Supported contribution modules": [[5, "supported-contribution-modules"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[9, "supported-transformations"]], "Synthetic dataset generator": [[6, "synthetic-dataset-generator"], [16, "synthetic-dataset-generator"]], "Task evaluation": [[10, "task-evaluation"]], "Text Detection": [[18, "text-detection"]], "Text Recognition": [[18, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[12, null]], "Two-stage approaches": [[18, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[16, "use-your-own-datasets"]], "Using your ONNX exported model": [[17, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[3, "via-conda-only-for-linux"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[10, "visualization"]], "What should I do with the output?": [[18, "what-should-i-do-with-the-output"]], "Word": [[7, "word"]], "docTR Notebooks": [[11, null]], "docTR 
Vocabs": [[6, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.contrib": [[5, null]], "doctr.datasets": [[6, null], [6, "datasets"]], "doctr.io": [[7, null]], "doctr.models": [[8, null]], "doctr.models.classification": [[8, "doctr-models-classification"]], "doctr.models.detection": [[8, "doctr-models-detection"]], "doctr.models.factory": [[8, "doctr-models-factory"]], "doctr.models.recognition": [[8, "doctr-models-recognition"]], "doctr.models.zoo": [[8, "doctr-models-zoo"]], "doctr.transforms": [[9, null]], "doctr.utils": [[10, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[7, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[7, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[9, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[6, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[9, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[9, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[6, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_mobilenet_v3_small", 
false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[8, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[6, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[8, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[8, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[7, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[8, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[6, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[6, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[7, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[7, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[6, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[6, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[9, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[9, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[6, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[6, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[6, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[6, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[6, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[8, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[9, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[7, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[8, 
"doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[6, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[9, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[8, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[6, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[9, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[7, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[8, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[9, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[9, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[9, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[9, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[9, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[9, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[9, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[9, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[9, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[9, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[9, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[9, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[7, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[7, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[7, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[7, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[6, 
"doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[9, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[7, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[7, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[6, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[10, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[10, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[10, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[10, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[6, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[6, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[6, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[9, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[10, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[10, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[10, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[10, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[10, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[8, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[8, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[6, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[7, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[6, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[6, 0, 1, "", "CORD"], [6, 0, 1, "", "CharacterGenerator"], [6, 0, 1, "", 
"DetectionDataset"], [6, 0, 1, "", "DocArtefacts"], [6, 0, 1, "", "FUNSD"], [6, 0, 1, "", "IC03"], [6, 0, 1, "", "IC13"], [6, 0, 1, "", "IIIT5K"], [6, 0, 1, "", "IIITHWS"], [6, 0, 1, "", "IMGUR5K"], [6, 0, 1, "", "MJSynth"], [6, 0, 1, "", "OCRDataset"], [6, 0, 1, "", "RecognitionDataset"], [6, 0, 1, "", "SROIE"], [6, 0, 1, "", "SVHN"], [6, 0, 1, "", "SVT"], [6, 0, 1, "", "SynthText"], [6, 0, 1, "", "WILDRECEIPT"], [6, 0, 1, "", "WordGenerator"], [6, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[6, 0, 1, "", "DataLoader"]], "doctr.io": [[7, 0, 1, "", "Artefact"], [7, 0, 1, "", "Block"], [7, 0, 1, "", "Document"], [7, 0, 1, "", "DocumentFile"], [7, 0, 1, "", "Line"], [7, 0, 1, "", "Page"], [7, 0, 1, "", "Word"], [7, 1, 1, "", "decode_img_as_tensor"], [7, 1, 1, "", "read_html"], [7, 1, 1, "", "read_img_as_numpy"], [7, 1, 1, "", "read_img_as_tensor"], [7, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[7, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[7, 2, 1, "", "from_images"], [7, 2, 1, "", "from_pdf"], [7, 2, 1, "", "from_url"]], "doctr.io.Page": [[7, 2, 1, "", "show"]], "doctr.models": [[8, 1, 1, "", "kie_predictor"], [8, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[8, 1, 1, "", "crop_orientation_predictor"], [8, 1, 1, "", "magc_resnet31"], [8, 1, 1, "", "mobilenet_v3_large"], [8, 1, 1, "", "mobilenet_v3_large_r"], [8, 1, 1, "", "mobilenet_v3_small"], [8, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [8, 1, 1, "", "mobilenet_v3_small_page_orientation"], [8, 1, 1, "", "mobilenet_v3_small_r"], [8, 1, 1, "", "page_orientation_predictor"], [8, 1, 1, "", "resnet18"], [8, 1, 1, "", "resnet31"], [8, 1, 1, "", "resnet34"], [8, 1, 1, "", "resnet50"], [8, 1, 1, "", "textnet_base"], [8, 1, 1, "", "textnet_small"], [8, 1, 1, "", "textnet_tiny"], [8, 1, 1, "", "vgg16_bn_r"], [8, 1, 1, "", "vit_b"], [8, 1, 1, "", "vit_s"]], "doctr.models.detection": [[8, 1, 1, "", "db_mobilenet_v3_large"], [8, 1, 1, "", "db_resnet50"], [8, 1, 1, "", "detection_predictor"], [8, 1, 1, "", "fast_base"], [8, 1, 1, "", "fast_small"], [8, 1, 1, "", "fast_tiny"], [8, 1, 1, "", "linknet_resnet18"], [8, 1, 1, "", "linknet_resnet34"], [8, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[8, 1, 1, "", "from_hub"], [8, 1, 1, "", "login_to_hub"], [8, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[8, 1, 1, "", "crnn_mobilenet_v3_large"], [8, 1, 1, "", "crnn_mobilenet_v3_small"], [8, 1, 1, "", "crnn_vgg16_bn"], [8, 1, 1, "", "master"], [8, 1, 1, "", "parseq"], [8, 1, 1, "", "recognition_predictor"], [8, 1, 1, "", "sar_resnet31"], [8, 1, 1, "", "vitstr_base"], [8, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[9, 0, 1, "", "ChannelShuffle"], [9, 0, 1, "", "ColorInversion"], [9, 0, 1, "", "Compose"], [9, 0, 1, "", "GaussianBlur"], [9, 0, 1, "", "GaussianNoise"], [9, 0, 1, "", "LambdaTransformation"], [9, 0, 1, "", "Normalize"], [9, 0, 1, "", "OneOf"], [9, 0, 1, "", "RandomApply"], [9, 0, 1, "", "RandomBrightness"], [9, 0, 1, "", "RandomContrast"], [9, 0, 1, "", "RandomCrop"], [9, 0, 1, "", "RandomGamma"], [9, 0, 1, "", "RandomHorizontalFlip"], [9, 0, 1, "", "RandomHue"], [9, 0, 1, "", "RandomJpegQuality"], [9, 0, 1, "", "RandomResize"], [9, 0, 1, "", "RandomRotate"], [9, 0, 1, "", "RandomSaturation"], [9, 0, 1, "", "RandomShadow"], [9, 0, 1, "", "Resize"], [9, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[10, 0, 1, "", "DetectionMetric"], [10, 0, 1, "", "LocalizationConfusion"], [10, 0, 1, "", "OCRMetric"], [10, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": 
[[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.visualization": [[10, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 7, 8, 10, 14, 17], "0": [1, 3, 6, 9, 10, 12, 15, 16, 18], "00": 18, "01": 18, "0123456789": 6, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 6, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 6, "02562": 8, "03": 18, "035": 18, "0361328125": 18, "04": 18, "05": 18, "06": 18, "06640625": 18, "07": 18, "08": [9, 18], "09": 18, "0966796875": 18, "1": [6, 7, 8, 9, 10, 12, 16, 18], "10": [6, 10, 18], "100": [6, 9, 10, 16, 18], "1000": 18, "101": 6, "1024": [8, 12, 18], "104": 6, "106": 6, "108": 6, "1095": 16, "11": 18, "110": 10, "1107": 16, "114": 6, "115": 6, "1156": 16, "116": 6, "118": 6, "11800h": 18, "11th": 18, "12": 18, "120": 6, "123": 6, "126": 6, "1268": 16, "128": [8, 12, 17, 18], "13": 18, "130": 6, "13068": 16, "131": 6, "1337891": 16, "1357421875": 18, "1396484375": 18, "14": 18, "1420": 18, "14470v1": 6, "149": 16, "15": 18, "150": [10, 18], "1552": 18, "16": [8, 17, 18], "1630859375": 18, "1684": 18, "16x16": 8, "17": 18, "1778": 18, "1782": 18, "18": [8, 18], "185546875": 18, "1900": 18, "1910": 8, "19342": 16, "19370": 16, "195": 6, "19598": 16, "199": 18, "1999": 18, "2": [3, 4, 6, 7, 9, 15, 18], "20": 18, "200": 10, "2000": 16, "2003": [4, 6], "2012": 6, "2013": [4, 6], "2015": 6, "2019": 4, "207901": 16, "21": 18, "2103": 6, "2186": 16, "21888": 16, "22": 18, "224": [8, 9], "225": 9, "22672": 16, "229": [9, 16], "23": 18, "233": 16, "236": 6, "24": 18, "246": 16, "249": 16, "25": 18, "2504": 18, "255": [7, 8, 9, 10, 18], "256": 8, "257": 16, "26": 18, "26032": 16, "264": 12, "27": 18, "2700": 16, "2710": 18, "2749": 12, "28": 18, "287": 12, "29": 18, "296": 12, "299": 12, "2d": 18, "3": [3, 4, 7, 8, 9, 10, 17, 18], "30": 18, "300": 16, "3000": 16, "301": 12, "30595": 18, "30ghz": 18, "31": 8, "32": [6, 8, 9, 12, 16, 17, 18], "3232421875": 18, "33": [9, 18], "33402": 16, "33608": 16, "34": [8, 18], "340": 18, "3456": 18, "3515625": 18, "36": 18, "360": 16, "37": [6, 18], "38": 18, "39": 18, "4": [8, 9, 10, 18], "40": 18, "406": 9, "41": 18, "42": 18, "43": 18, "44": 18, "45": 18, "456": 9, "46": 18, "47": 18, "472": 16, "48": [6, 18], "485": 9, "49": 18, "49377": 16, "5": [6, 9, 10, 15, 18], "50": [8, 16, 18], "51": 18, "51171875": 18, "512": 8, "52": [6, 18], "529": 18, "53": 18, "54": 18, "540": 18, "5478515625": 18, "55": 18, "56": 18, "57": 18, "58": [6, 18], "580": 18, "5810546875": 18, "583": 18, "59": 18, "597": 18, "5k": [4, 6], "5m": 18, "6": [9, 18], "60": 9, "600": [8, 10, 18], "61": 18, "62": 18, "626": 16, "63": 18, "64": [8, 9, 18], "641": 18, "647": 16, "65": 18, "66": 18, "67": 18, "68": 18, "69": 18, "693": 12, "694": 12, "695": 12, "6m": 18, "7": 18, "70": [6, 10, 18], "707470": 16, "71": [6, 18], "7100000": 16, "7141797": 16, "7149": 16, "72": 18, "72dpi": 7, "73": 18, "73257": 16, "74": 18, "75": [9, 18], "7581382": 16, "76": 18, "77": 18, "772": 12, "772875": 16, "78": 18, "785": 12, "79": 18, 
"793533": 16, "796": 16, "798": 12, "7m": 18, "8": [8, 9, 18], "80": 18, "800": [8, 10, 16, 18], "81": 18, "82": 18, "83": 18, "84": 18, "849": 16, "85": 18, "8564453125": 18, "857": 18, "85875": 16, "86": 18, "8603515625": 18, "87": 18, "8707": 16, "88": 18, "89": 18, "9": [3, 9, 18], "90": 18, "90k": 6, "90kdict32px": 6, "91": 18, "914085328578949": 18, "92": 18, "93": 18, "94": [6, 18], "95": [10, 18], "9578408598899841": 18, "96": 18, "97": 18, "98": 18, "99": 18, "9949972033500671": 18, "A": [1, 2, 4, 6, 7, 8, 11, 17], "As": 2, "Be": 18, "Being": 1, "By": 13, "For": [1, 2, 3, 12, 18], "If": [2, 7, 8, 12, 18], "In": [2, 6, 16], "It": [9, 14, 15, 17], "Its": [4, 8], "No": [1, 18], "Of": 6, "Or": [15, 17], "The": [1, 2, 6, 7, 10, 13, 15, 16, 17, 18], "Then": 8, "To": [2, 3, 13, 14, 15, 17, 18], "_": [1, 6, 8], "__call__": 18, "_build": 2, "_i": 10, "ab": 6, "abc": 17, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 6, "abdef": [6, 16], "abl": [16, 18], "about": [1, 16, 18], "abov": 18, "abstractdataset": 6, "abus": 1, "accept": 1, "access": [4, 7, 16, 18], "account": [1, 14], "accur": 18, "accuraci": 10, "achiev": 17, "act": 1, "action": 1, "activ": 4, "ad": [2, 8, 9], "adapt": 1, "add": [9, 10, 14, 18], "add_hook": 18, "add_label": 10, "addit": [2, 3, 7, 15, 18], "addition": [2, 18], "address": [1, 7], "adjust": 9, "advanc": 1, "advantag": 17, "advis": 2, "aesthet": [4, 6], "affect": 1, "after": [14, 18], "ag": 1, "again": 8, "aggreg": [10, 16], "aggress": 1, "align": [1, 7, 9], "all": [1, 2, 5, 6, 7, 9, 10, 15, 16, 18], "allow": [1, 17], "along": 18, "alreadi": [2, 17], "also": [1, 8, 14, 15, 16, 18], "alwai": 16, "an": [1, 2, 4, 6, 7, 8, 10, 15, 17, 18], "analysi": [7, 15], "ancient_greek": 6, "angl": [7, 9], "ani": [1, 6, 7, 8, 9, 10, 17, 18], "annot": 6, "anot": 16, "anoth": [8, 12, 16], "answer": 1, "anyascii": 10, "anyon": 4, "anyth": 15, "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 6, 9], "applic": [4, 8], "appoint": 1, "appreci": 14, "appropri": [1, 2, 18], "ar": [1, 2, 3, 5, 6, 7, 9, 10, 11, 15, 16, 18], "arab": 6, "arabic_diacrit": 6, "arabic_lett": 6, "arabic_punctu": 6, "arbitrarili": [4, 8], "arch": [8, 14], "architectur": [4, 8, 14, 15], "area": 18, "argument": [6, 7, 8, 10, 12, 18], "around": 1, "arrai": [7, 9, 10], "art": [4, 15], "artefact": [10, 15, 18], "artefact_typ": 7, "artifici": [4, 6], "arxiv": [6, 8], "asarrai": 10, "ascii_lett": 6, "aspect": [4, 8, 9, 18], "assess": 10, "assign": 10, "associ": 7, "assum": 8, "assume_straight_pag": [8, 12, 18], "astyp": [8, 10, 18], "attack": 1, "attend": [4, 8], "attent": [1, 8], "autom": 4, "automat": 18, "autoregress": [4, 8], "avail": [1, 4, 5, 9], "averag": [9, 18], "avoid": [1, 3], "aw": [4, 18], "awar": 18, "azur": 18, "b": [8, 10, 18], "b_j": 10, "back": 2, "backbon": 8, "backend": 18, "background": 16, "bangla": 6, "bar": 15, "bar_cod": 16, "base": [4, 8, 15], "baselin": [4, 8, 18], "batch": [6, 8, 9, 15, 16, 18], "batch_siz": [6, 12, 15, 16, 17], "bblanchon": 3, "bbox": 18, "becaus": 13, "been": [2, 10, 16, 18], "befor": [6, 8, 9, 18], "begin": 10, "behavior": [1, 18], "being": [10, 18], "belong": 18, "benchmark": 18, "best": 1, "better": [11, 18], "between": [9, 10, 18], "bgr": 7, "bilinear": 9, "bin_thresh": 18, "binar": [4, 8, 18], "binari": [7, 17, 18], "bit": 17, "block": [10, 18], "block_1_1": 18, "blur": 9, "bmvc": 6, "bn": 14, "bodi": [1, 18], "bool": [6, 7, 8, 9, 10], "boolean": [8, 18], "both": [4, 6, 9, 16, 18], "bottom": [8, 18], "bound": [6, 7, 8, 9, 10, 15, 16, 
18], "box": [6, 7, 8, 9, 10, 15, 16, 18], "box_thresh": 18, "bright": 9, "browser": [2, 4], "build": [2, 3, 17], "built": 2, "byte": [7, 18], "c": [3, 7, 10], "c_j": 10, "cach": [2, 6, 13], "cache_sampl": 6, "call": 17, "callabl": [6, 9], "can": [2, 3, 12, 13, 14, 15, 16, 18], "capabl": [2, 11, 18], "case": [6, 10], "cf": 18, "cfg": 18, "challeng": 6, "challenge2_test_task12_imag": 6, "challenge2_test_task1_gt": 6, "challenge2_training_task12_imag": 6, "challenge2_training_task1_gt": 6, "chang": [13, 18], "channel": [1, 2, 7, 9], "channel_prior": 3, "channelshuffl": 9, "charact": [4, 6, 7, 10, 16, 18], "charactergener": [6, 16], "characterist": 1, "charg": 18, "charset": 18, "chart": 7, "check": [2, 14, 18], "checkpoint": 8, "chip": 3, "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 6, 7, 9, 10, 18], "class_nam": 12, "classif": [16, 18], "classmethod": 7, "clear": 2, "clone": 3, "close": 2, "co": 14, "code": [4, 7, 15], "codecov": 2, "colab": 11, "collate_fn": 6, "collect": [7, 15], "color": 9, "colorinvers": 9, "column": 7, "com": [1, 3, 7, 8, 14], "combin": 18, "command": [2, 15], "comment": 1, "commit": 1, "common": [1, 9, 10, 17], "commun": 1, "compar": 4, "comparison": [10, 18], "competit": 6, "compil": [11, 18], "complaint": 1, "complementari": 10, "complet": 2, "compon": 18, "compos": [6, 18], "comprehens": 18, "comput": [6, 10, 17, 18], "conf_threshold": 15, "confid": [7, 18], "config": [3, 8], "configur": 8, "confus": 10, "consecut": [9, 18], "consequ": 1, "consid": [1, 2, 6, 7, 10, 18], "consist": 18, "consolid": [4, 6], "constant": 9, "construct": 1, "contact": 1, "contain": [5, 6, 11, 16, 18], "content": [6, 7, 18], "context": 8, "contib": 3, "continu": 1, "contrast": 9, "contrast_factor": 9, "contrib": [3, 15], "contribut": 1, "contributor": 2, "convers": 7, "convert": [7, 9], "convolut": 8, "coordin": [7, 18], "cord": [4, 6, 16, 18], "core": [10, 18], "corner": 18, "correct": 9, "correspond": [3, 7, 9, 18], "could": [1, 15], "counterpart": 10, "cover": 2, "coverag": 2, "cpu": [4, 12, 17], "creat": 14, "crnn": [4, 8, 14], "crnn_mobilenet_v3_larg": [8, 14, 18], "crnn_mobilenet_v3_smal": [8, 17, 18], "crnn_vgg16_bn": [8, 12, 14, 18], "crop": [7, 8, 9, 12, 16, 18], "crop_orient": [7, 18], "crop_orientation_predictor": [8, 12], "crop_param": 12, "cuda": 17, "currenc": 6, "current": [2, 12, 18], "custom": [14, 15, 17, 18], "custom_crop_orientation_model": 12, "custom_page_orientation_model": 12, "customhook": 18, "cvit": 4, "czczup": 8, "czech": 6, "d": [6, 16], "danish": 6, "data": [4, 6, 7, 9, 10, 12, 14], "dataload": 16, "dataset": [8, 12, 18], "dataset_info": 6, "date": [12, 18], "db": 14, "db_mobilenet_v3_larg": [8, 14, 18], "db_resnet34": 18, "db_resnet50": [8, 12, 14, 18], "dbnet": [4, 8], "deal": [11, 18], "decis": 1, "decod": 7, "decode_img_as_tensor": 7, "dedic": 17, "deem": 1, "deep": [8, 18], "def": 18, "default": [3, 7, 12, 13, 18], "defer": 16, "defin": [10, 17], "degre": [7, 9, 18], "degress": 7, "delet": 2, "delimit": 18, "delta": 9, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4, 18], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": 8, "descript": 11, "design": 9, "desir": 7, "det_arch": [8, 12, 14, 17], "det_b": 18, "det_model": [12, 14, 17], "det_param": 12, "det_predictor": [12, 18], "detail": [12, 18], "detect": [6, 7, 10, 11, 12, 15], "detect_languag": 8, "detect_orient": [8, 12, 18], "detection_predictor": [8, 18], "detection_task": [6, 16], "detectiondataset": [6, 16], "detectionmetr": 10, "detectionpredictor": [8, 12], "detector": [4, 8, 15], 
"deterior": 8, "determin": 1, "dev": [2, 13], "develop": 3, "deviat": 9, "devic": 17, "dict": [7, 10, 18], "dictionari": [7, 10], "differ": 1, "differenti": [4, 8], "digit": [4, 6, 16], "dimens": [7, 10, 18], "dimension": 9, "direct": 6, "directli": [14, 18], "directori": [2, 13], "disabl": [1, 13, 18], "disable_crop_orient": 18, "disable_page_orient": 18, "disclaim": 18, "discuss": 2, "disparag": 1, "displai": [7, 10], "display_artefact": 10, "distribut": 9, "div": 18, "divers": 1, "divid": 7, "do": [2, 3, 8], "doc": [2, 7, 15, 17, 18], "docartefact": [6, 16], "docstr": 2, "doctr": [3, 12, 13, 14, 15, 16, 17, 18], "doctr_cache_dir": 13, "doctr_multiprocessing_dis": 13, "document": [6, 8, 10, 11, 12, 15, 16, 17, 18], "documentbuild": 18, "documentfil": [7, 12, 14, 15, 17], "doesn": 17, "don": [12, 18], "done": 9, "download": [6, 16], "downsiz": 8, "draw": 9, "drop": 6, "drop_last": 6, "dtype": [7, 8, 9, 10, 17], "dual": [4, 6], "dummi": 14, "dummy_img": 18, "dummy_input": 17, "dure": 1, "dutch": 6, "dynam": [6, 15], "dynamic_seq_length": 6, "e": [1, 2, 3, 7, 8], "each": [4, 6, 7, 8, 9, 10, 16, 18], "eas": 2, "easi": [4, 10, 14, 17], "easili": [7, 10, 12, 14, 16, 18], "econom": 1, "edit": 1, "educ": 1, "effect": 18, "effici": [2, 4, 6, 8], "either": [10, 18], "element": [6, 7, 8, 18], "els": [2, 15], "email": 1, "empathi": 1, "en": 18, "enabl": [6, 7], "enclos": 7, "encod": [4, 6, 7, 8, 18], "encode_sequ": 6, "encount": 2, "encrypt": 7, "end": [4, 6, 8, 10], "english": [6, 16], "enough": [2, 18], "ensur": 2, "entri": 6, "environ": [1, 13], "eo": 6, "equiv": 18, "estim": 8, "etc": [7, 15], "ethnic": 1, "evalu": [16, 18], "event": 1, "everyon": 1, "everyth": [2, 18], "exact": [10, 18], "exampl": [1, 2, 4, 6, 8, 14, 18], "exchang": 17, "execut": 18, "exist": 14, "expand": 9, "expect": [7, 9, 10], "experi": 1, "explan": [1, 18], "explicit": 1, "exploit": [4, 8], "export": [7, 8, 10, 11, 15, 18], "export_as_straight_box": [8, 18], "export_as_xml": 18, "export_model_to_onnx": 17, "express": [1, 9], "extens": 7, "extern": [1, 16], "extract": [4, 6], "extractor": 8, "f_": 10, "f_a": 10, "factor": 9, "fair": 1, "fairli": 1, "fals": [6, 7, 8, 9, 10, 12, 18], "faq": 1, "fascan": 14, "fast": [4, 6, 8], "fast_bas": [8, 18], "fast_smal": [8, 18], "fast_tini": [8, 18], "faster": [4, 8, 17], "fasterrcnn_mobilenet_v3_large_fpn": 8, "favorit": 18, "featur": [3, 8, 10, 11, 12, 15], "feedback": 1, "feel": [2, 14], "felix92": 14, "few": [17, 18], "figsiz": 10, "figur": [10, 15], "file": [2, 6], "final": 8, "find": [2, 16], "finnish": 6, "first": [2, 6], "firsthand": 6, "fit": [8, 18], "flag": 18, "flip": 9, "float": [7, 9, 10, 17], "float32": [7, 8, 9, 17], "fn": 9, "focu": 14, "focus": [1, 6], "folder": 6, "follow": [1, 2, 3, 6, 9, 10, 12, 13, 14, 15, 18], "font": 6, "font_famili": 6, "foral": 10, "forc": 2, "forg": 3, "form": [4, 6, 18], "format": [7, 10, 12, 16, 17, 18], "forpost": [4, 6], "forum": 2, "fp16": 17, "frac": 10, "framework": [3, 14, 16, 18], "free": [1, 2, 14], "french": [6, 12, 14, 18], "friendli": 4, "from": [1, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18], "from_hub": [8, 14], "from_imag": [7, 14, 15, 17], "from_pdf": 7, "from_url": 7, "full": [6, 10, 18], "function": [6, 9, 10, 15], "funsd": [4, 6, 16, 18], "further": 16, "futur": 6, "g": [7, 8], "g_": 10, "g_x": 10, "gamma": 9, "gaussian": 9, "gaussianblur": 9, "gaussiannois": 9, "gen": 18, "gender": 1, "gener": [2, 4, 7, 8], "generic_cyrillic_lett": 6, "geometri": [4, 7, 18], "geq": 10, "german": [6, 12, 14], "get": [17, 18], "git": 14, 
"github": [2, 3, 8, 14], "give": [1, 15], "given": [6, 7, 9, 10, 18], "global": 8, "go": 18, "good": 17, "googl": 2, "googlevis": 4, "gpu": [4, 15, 17], "gracefulli": 1, "graph": [4, 6, 7], "grayscal": 9, "ground": 10, "groung": 10, "group": [4, 18], "gt": 10, "gt_box": 10, "gt_label": 10, "guid": 2, "guidanc": 16, "gvision": 18, "h": [7, 8, 9], "h_": 10, "ha": [2, 6, 10, 16], "handl": [11, 16, 18], "handwrit": 6, "handwritten": 16, "harass": 1, "hardwar": 18, "harm": 1, "hat": 10, "have": [1, 2, 10, 12, 14, 16, 17, 18], "head": [8, 18], "healthi": 1, "hebrew": 6, "height": [7, 9], "hello": [10, 18], "help": 17, "here": [5, 9, 11, 15, 16, 18], "hf": 8, "hf_hub_download": 8, "high": 7, "higher": [3, 6, 18], "hindi": 6, "hindi_digit": 6, "hocr": 18, "hook": 18, "horizont": [7, 9, 18], "hous": 6, "how": [2, 11, 12, 14, 16], "howev": 16, "hsv": 9, "html": [1, 2, 3, 7, 18], "http": [1, 3, 6, 7, 8, 14, 18], "hub": 8, "hue": 9, "huggingfac": 8, "hw": 6, "i": [1, 2, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17], "i7": 18, "ic03": [4, 6, 16], "ic13": [4, 6, 16], "icdar": [4, 6], "icdar2019": 6, "id": 18, "ident": 1, "identifi": 4, "iiit": [4, 6], "iiit5k": [6, 16], "iiithw": [4, 6, 16], "imag": [4, 6, 7, 8, 9, 10, 14, 15, 16, 18], "imagenet": 8, "imageri": 1, "images_90k_norm": 6, "img": [6, 9, 16, 17], "img_cont": 7, "img_fold": [6, 16], "img_path": 7, "img_transform": 6, "imgur5k": [4, 6, 16], "imgur5k_annot": 6, "imlist": 6, "impact": 1, "implement": [6, 7, 8, 9, 10, 18], "import": [6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18], "improv": 8, "inappropri": 1, "incid": 1, "includ": [1, 6, 16, 17], "inclus": 1, "increas": 9, "independ": 9, "index": [2, 7], "indic": 10, "individu": 1, "infer": [4, 8, 9, 15, 18], "inform": [1, 2, 4, 6, 16], "input": [2, 7, 8, 9, 17, 18], "input_crop": 8, "input_pag": [8, 10, 18], "input_shap": 17, "input_tensor": 8, "inspir": [1, 9], "instal": [14, 15, 17], "instanc": [1, 18], "instanti": [8, 18], "instead": [6, 7, 8], "insult": 1, "int": [6, 7, 9], "int64": 10, "integ": 10, "integr": [4, 14, 16], "intel": 18, "interact": [1, 7, 10], "interfac": [14, 17], "interoper": 17, "interpol": 9, "interpret": [6, 7], "intersect": 10, "invert": 9, "investig": 1, "invis": 1, "involv": [1, 18], "io": [12, 14, 15, 17], "iou": 10, "iou_thresh": 10, "iou_threshold": 15, "irregular": [4, 8, 16], "isn": 6, "issu": [1, 2, 14], "italian": 6, "iter": [6, 9, 16, 18], "its": [7, 8, 9, 10, 16, 18], "itself": [8, 14], "j": 10, "job": 2, "join": 2, "jpeg": 9, "jpegqual": 9, "jpg": [6, 7, 14, 17], "json": [6, 16, 18], "json_output": 18, "jump": 2, "just": 1, "kei": [4, 6], "kera": [8, 17], "kernel": [4, 8, 9], "kernel_shap": 9, "keywoard": 8, "keyword": [6, 7, 8, 10], "kie": [8, 12], "kie_predictor": [8, 12], "kiepredictor": 8, "kind": 1, "know": [2, 17], "kwarg": [6, 7, 8, 10], "l": 10, "l_j": 10, "label": [6, 10, 15, 16], "label_fil": [6, 16], "label_fold": 6, "label_path": [6, 16], "labels_path": [6, 16], "ladder": 1, "lambda": 9, "lambdatransform": 9, "lang": 18, "languag": [1, 4, 6, 7, 8, 14, 18], "larg": [8, 14], "largest": 10, "last": [3, 6], "latenc": 8, "later": 2, "latest": 18, "latin": 6, "layer": 17, "layout": 18, "lead": 1, "leader": 1, "learn": [1, 4, 8, 17, 18], "least": 3, "left": [10, 18], "legacy_french": 6, "length": [6, 18], "less": [17, 18], "level": [1, 6, 10, 18], "leverag": 11, "lf": 14, "librari": [2, 3, 11, 12], "light": 4, "lightweight": 17, "like": 1, "limits_": 10, "line": [4, 8, 10, 18], "line_1_1": 18, "link": 12, "linknet": [4, 8], "linknet_resnet18": [8, 12, 17, 18], 
"linknet_resnet34": [8, 17, 18], "linknet_resnet50": [8, 18], "list": [6, 7, 9, 10, 14], "ll": 10, "load": [4, 6, 8, 15, 17], "load_state_dict": 12, "load_weight": 12, "loc_pr": 18, "local": [2, 4, 6, 8, 10, 16, 18], "localis": 6, "localizationconfus": 10, "locat": [2, 7, 18], "login": 8, "login_to_hub": [8, 14], "logo": [7, 15, 16], "love": 14, "lower": [9, 10, 18], "m": [2, 10, 18], "m1": 3, "macbook": 3, "machin": 17, "made": 4, "magc_resnet31": 8, "mai": [1, 2], "mail": 1, "main": 11, "maintain": 4, "mainten": 2, "make": [1, 2, 10, 12, 13, 14, 17, 18], "mani": [16, 18], "manipul": 18, "map": [6, 8], "map_loc": 12, "master": [4, 8, 18], "match": [10, 18], "mathcal": 10, "matplotlib": [7, 10], "max": [6, 9, 10], "max_angl": 9, "max_area": 9, "max_char": [6, 16], "max_delta": 9, "max_gain": 9, "max_gamma": 9, "max_qual": 9, "max_ratio": 9, "maximum": [6, 9], "maxval": [8, 9], "mbox": 10, "mean": [9, 10, 12], "meaniou": 10, "meant": [7, 17], "measur": 18, "media": 1, "median": 8, "meet": 12, "member": 1, "memori": [13, 17], "mention": 18, "merg": 6, "messag": 2, "meta": 18, "metadata": 17, "metal": 3, "method": [7, 9, 18], "metric": [10, 18], "middl": 18, "might": [17, 18], "min": 9, "min_area": 9, "min_char": [6, 16], "min_gain": 9, "min_gamma": 9, "min_qual": 9, "min_ratio": 9, "min_val": 9, "minde": [1, 3, 4, 8], "minim": [2, 4], "minimalist": [4, 8], "minimum": [3, 6, 9, 10, 18], "minval": 9, "miss": 3, "mistak": 1, "mixed_float16": 17, "mixed_precis": 17, "mjsynth": [4, 6, 16], "mnt": 6, "mobilenet": [8, 14], "mobilenet_v3_larg": 8, "mobilenet_v3_large_r": 8, "mobilenet_v3_smal": [8, 12], "mobilenet_v3_small_crop_orient": [8, 12], "mobilenet_v3_small_page_orient": [8, 12], "mobilenet_v3_small_r": 8, "mobilenetv3": 8, "modal": [4, 6], "mode": 3, "model": [6, 10, 13, 15, 16], "model_nam": [8, 14, 17], "model_path": [15, 17], "moder": 1, "modif": 2, "modifi": [8, 13, 18], "modul": [3, 7, 8, 9, 10, 18], "more": [2, 16, 18], "most": 18, "mozilla": 1, "multi": [4, 8], "multilingu": [6, 14], "multipl": [6, 7, 9, 18], "multipli": 9, "multiprocess": 13, "my": 8, "my_awesome_model": 14, "my_hook": 18, "n": [6, 10], "name": [6, 8, 17, 18], "nation": 1, "natur": [1, 4, 6], "ndarrai": [6, 7, 9, 10], "necessari": [3, 12, 13], "need": [2, 3, 6, 10, 12, 13, 14, 15, 18], "neg": 9, "nest": 18, "network": [4, 6, 8, 17], "neural": [4, 6, 8, 17], "new": [2, 10], "next": [6, 16], "nois": 9, "noisi": [4, 6], "non": [4, 6, 7, 8, 9, 10], "none": [6, 7, 8, 9, 10, 18], "normal": [8, 9], "norwegian": 6, "note": [0, 2, 6, 8, 12, 14, 15, 17], "now": 2, "np": [8, 9, 10, 18], "num_output_channel": 9, "num_sampl": [6, 16], "number": [6, 9, 10, 18], "numpi": [7, 8, 10, 18], "o": 3, "obb": 15, "obj_detect": 14, "object": [6, 7, 10, 15, 18], "objectness_scor": [7, 18], "oblig": 1, "obtain": 18, "occupi": 17, "ocr": [4, 6, 8, 10, 14], "ocr_carea": 18, "ocr_db_crnn": 10, "ocr_lin": 18, "ocr_pag": 18, "ocr_par": 18, "ocr_predictor": [8, 12, 14, 17, 18], "ocrdataset": [6, 16], "ocrmetr": 10, "ocrpredictor": [8, 12], "ocrx_word": 18, "offens": 1, "offici": [1, 8], "offlin": 1, "offset": 9, "onc": 18, "one": [2, 6, 8, 9, 12, 14, 18], "oneof": 9, "ones": [6, 10], "onli": [2, 8, 9, 10, 12, 14, 16, 17, 18], "onlin": 1, "onnx": 15, "onnxruntim": [15, 17], "onnxtr": 17, "opac": 9, "opacity_rang": 9, "open": [1, 2, 14, 17], "opinion": 1, "optic": [4, 18], "optim": [4, 18], "option": [6, 8, 12], "order": [2, 6, 7, 9], "org": [1, 6, 8, 18], "organ": 7, "orient": [1, 7, 8, 11, 15, 18], "orientationpredictor": 8, "other": [1, 2], 
"otherwis": [1, 7, 10], "our": [2, 8, 18], "out": [2, 8, 9, 10, 18], "outpout": 18, "output": [7, 9, 17], "output_s": [7, 9], "outsid": 13, "over": [6, 10, 18], "overal": [1, 8], "overlai": 7, "overview": 15, "overwrit": 12, "overwritten": 14, "own": 4, "p": [9, 18], "packag": [2, 4, 10, 13, 15, 16, 17], "pad": [6, 8, 9, 18], "page": [3, 6, 8, 10, 12, 18], "page1": 7, "page2": 7, "page_1": 18, "page_idx": [7, 18], "page_orientation_predictor": [8, 12], "page_param": 12, "pair": 10, "paper": 8, "par_1_1": 18, "paragraph": 18, "paragraph_break": 18, "param": [9, 18], "paramet": [4, 7, 8, 17], "pars": [4, 6], "parseq": [4, 8, 14, 17, 18], "part": [6, 9, 18], "parti": 3, "partial": 18, "particip": 1, "pass": [6, 7, 8, 12, 18], "password": 7, "patch": [8, 10], "path": [6, 7, 15, 16, 17], "path_to_checkpoint": 12, "path_to_custom_model": 17, "path_to_pt": 12, "pattern": 1, "pdf": [7, 8, 11], "pdfpage": 7, "peopl": 1, "per": [9, 18], "perform": [4, 7, 8, 9, 10, 13, 17, 18], "period": 1, "permiss": 1, "permut": [4, 8], "persian_lett": 6, "person": [1, 16], "phase": 18, "photo": 16, "physic": [1, 7], "pick": 9, "pictur": 7, "pip": [2, 3, 15, 17], "pipelin": 18, "pixel": [7, 9, 18], "pleas": 2, "plot": 10, "plt": 10, "plug": 14, "plugin": 3, "png": 7, "point": 17, "polici": 13, "polish": 6, "polit": 1, "polygon": [6, 10, 18], "pool": 8, "portugues": 6, "posit": [1, 10], "possibl": [2, 10, 14, 18], "post": [1, 18], "postprocessor": 18, "potenti": 8, "power": 4, "ppageno": 18, "pre": [2, 8, 17], "precis": [10, 18], "pred": 10, "pred_box": 10, "pred_label": 10, "predefin": 16, "predict": [7, 8, 10, 18], "predictor": [4, 7, 8, 11, 12, 14, 17], "prefer": 16, "preinstal": 3, "preprocessor": [12, 18], "prerequisit": 14, "present": 11, "preserv": [8, 9, 18], "preserve_aspect_ratio": [7, 8, 9, 12, 18], "pretrain": [4, 8, 10, 12, 17, 18], "pretrained_backbon": [8, 12], "print": 18, "prior": 6, "privaci": 1, "privat": 1, "probabl": 9, "problem": 2, "procedur": 9, "process": [2, 4, 7, 12, 18], "processor": 18, "produc": [11, 18], "product": 17, "profession": 1, "project": [2, 16], "promptli": 1, "proper": 2, "properli": 6, "provid": [1, 2, 4, 14, 15, 16, 18], "public": [1, 4], "publicli": 18, "publish": 1, "pull": 14, "punctuat": 6, "pure": 6, "purpos": 2, "push_to_hf_hub": [8, 14], "py": 14, "pypdfium2": [3, 7], "pyplot": [7, 10], "python": [2, 15], "python3": 14, "pytorch": [3, 4, 8, 9, 12, 14, 17, 18], "q": 2, "qr": [7, 15], "qr_code": 16, "qualiti": 9, "question": 1, "quickli": 4, "quicktour": 11, "r": 18, "race": 1, "ramdisk": 6, "rand": [8, 9, 10, 17, 18], "random": [8, 9, 10, 18], "randomappli": 9, "randombright": 9, "randomcontrast": 9, "randomcrop": 9, "randomgamma": 9, "randomhorizontalflip": 9, "randomhu": 9, "randomjpegqu": 9, "randomli": 9, "randomres": 9, "randomrot": 9, "randomsatur": 9, "randomshadow": 9, "rang": 9, "rassi": 14, "ratio": [8, 9, 18], "raw": [7, 10], "re": 17, "read": [4, 6, 8], "read_html": 7, "read_img_as_numpi": 7, "read_img_as_tensor": 7, "read_pdf": 7, "readi": 17, "real": [4, 8, 9], "reason": [1, 4, 6], "rebuild": 2, "rebuilt": 2, "recal": [10, 18], "receipt": [4, 6, 18], "reco_arch": [8, 12, 14, 17], "reco_b": 18, "reco_model": [12, 14, 17], "reco_param": 12, "reco_predictor": 12, "recogn": 18, "recognit": [6, 10, 11, 12], "recognition_predictor": [8, 18], "recognition_task": [6, 16], "recognitiondataset": [6, 16], "recognitionpredictor": [8, 12], "rectangular": 8, "reduc": [3, 9], "refer": [2, 3, 12, 14, 15, 16, 18], "regardless": 1, "region": 18, "regroup": 10, "regular": 
16, "reject": 1, "rel": [7, 9, 10, 18], "relat": 7, "releas": [0, 3], "relev": 15, "religion": 1, "remov": 1, "render": [7, 18], "repo": 8, "repo_id": [8, 14], "report": 1, "repositori": [6, 8, 14], "repres": [1, 17, 18], "represent": [4, 8], "request": [1, 14], "requir": [3, 9, 17], "research": 4, "residu": 8, "resiz": [9, 18], "resnet": 8, "resnet18": [8, 14], "resnet31": 8, "resnet34": 8, "resnet50": [8, 14], "resolv": 7, "resolve_block": 18, "resolve_lin": 18, "resourc": 16, "respect": 1, "rest": [2, 9, 10], "restrict": 13, "result": [2, 6, 7, 11, 14, 17, 18], "return": 18, "reusabl": 18, "review": 1, "rgb": [7, 9], "rgb_mode": 7, "rgb_output": 7, "right": [1, 8, 10], "robust": [4, 6], "root": 6, "rotat": [6, 7, 8, 9, 10, 11, 12, 16, 18], "run": [2, 3, 8], "same": [2, 7, 10, 16, 17, 18], "sampl": [6, 16, 18], "sample_transform": 6, "sar": [4, 8], "sar_resnet31": [8, 18], "satur": 9, "save": [8, 16], "scale": [7, 8, 9, 10], "scale_rang": 9, "scan": [4, 6], "scene": [4, 6, 8], "score": [7, 10], "script": [2, 16], "seamless": 4, "seamlessli": [4, 18], "search": 8, "searchabl": 11, "sec": 18, "second": 18, "section": [12, 14, 15, 17, 18], "secur": [1, 13], "see": [1, 2], "seen": 18, "segment": [4, 8, 18], "self": 18, "semant": [4, 8], "send": 18, "sens": 10, "sensit": 16, "separ": 18, "sequenc": [4, 6, 7, 8, 10, 18], "sequenti": [9, 18], "seri": 1, "seriou": 1, "set": [1, 3, 6, 8, 10, 13, 15, 18], "set_global_polici": 17, "sever": [7, 9, 18], "sex": 1, "sexual": 1, "shade": 9, "shape": [4, 7, 8, 9, 10, 18], "share": [13, 16], "shift": 9, "shm": 13, "should": [2, 6, 7, 9, 10], "show": [4, 7, 8, 10, 12, 14, 15], "showcas": [2, 11], "shuffl": [6, 9], "side": 10, "signatur": 7, "signific": 16, "simpl": [4, 8, 17], "simpler": 8, "sinc": [6, 16], "singl": [1, 2, 4, 6], "single_img_doc": 17, "size": [1, 6, 7, 9, 15, 18], "skew": 18, "slack": 2, "slightli": 8, "small": [2, 8, 18], "smallest": 7, "snapshot_download": 8, "snippet": 18, "so": [2, 3, 6, 8, 14, 16], "social": 1, "socio": 1, "some": [3, 11, 14, 16], "someth": 2, "somewher": 2, "sort": 1, "sourc": [6, 7, 8, 9, 10, 14], "space": [1, 18], "span": 18, "spanish": 6, "spatial": [4, 6, 7], "specif": [2, 3, 10, 12, 16, 18], "specifi": [1, 6, 7], "speed": [4, 8, 18], "sphinx": 2, "sroie": [4, 6, 16], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": 11, "standard": 9, "start": 6, "state": [4, 10, 15], "static": 10, "statu": 1, "std": [9, 12], "step": 13, "still": 18, "str": [6, 7, 8, 9, 10], "straight": [6, 8, 16, 18], "straighten": 18, "straighten_pag": [8, 12, 18], "straigten_pag": 12, "stream": 7, "street": [4, 6], "strict": 3, "strictli": 10, "string": [6, 7, 10, 18], "strive": 3, "strong": [4, 8], "structur": [17, 18], "subset": [6, 18], "suggest": [2, 14], "sum": 10, "summari": 10, "support": [3, 12, 15, 17, 18], "sustain": 1, "svhn": [4, 6, 16], "svt": [6, 16], "swedish": 6, "symmetr": [8, 9, 18], "symmetric_pad": [8, 9, 18], "synthet": 4, "synthtext": [4, 6, 16], "system": 18, "t": [2, 6, 12, 17, 18], "tabl": [14, 15, 16], "take": [1, 6, 18], "target": [6, 7, 9, 10, 16], "target_s": 6, "task": [4, 6, 8, 14, 16, 18], "task2": 6, "team": 3, "techminde": 3, "templat": [2, 4], "tensor": [6, 7, 9, 18], "tensorflow": [3, 4, 7, 8, 9, 12, 14, 17, 18], "tensorspec": 17, "term": 1, "test": [6, 16], "test_set": 6, "text": [6, 7, 8, 10, 16], "text_output": 18, "textmatch": 10, "textnet": 8, "textnet_bas": 8, "textnet_smal": 8, "textnet_tini": 8, "textract": [4, 18], "textstylebrush": [4, 6], "textual": [4, 6, 7, 8, 18], "tf": [3, 7, 8, 9, 14, 
17], "than": [2, 10, 14], "thank": 2, "thei": [1, 10], "them": [6, 18], "thi": [1, 2, 3, 5, 6, 9, 10, 12, 13, 14, 16, 17, 18], "thing": [17, 18], "third": 3, "those": [1, 7, 18], "threaten": 1, "threshold": 18, "through": [1, 9, 15, 16], "tilman": 14, "time": [1, 4, 8, 10, 16], "tini": 8, "titl": [7, 18], "tm": 18, "tmp": 13, "togeth": [2, 7], "tograi": 9, "tool": 16, "top": [10, 17, 18], "topic": 2, "torch": [3, 9, 12, 14, 17], "torchvis": 9, "total": 12, "toward": [1, 3], "train": [2, 6, 8, 9, 14, 15, 16, 17, 18], "train_it": [6, 16], "train_load": [6, 16], "train_pytorch": 14, "train_set": [6, 16], "train_tensorflow": 14, "trainabl": [4, 8], "tranform": 9, "transcrib": 18, "transfer": [4, 6], "transfo": 9, "transform": [4, 6, 8], "translat": 1, "troll": 1, "true": [6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18], "truth": 10, "tune": 17, "tupl": [6, 7, 9, 10], "two": [7, 13], "txt": 6, "type": [7, 10, 14, 17, 18], "typic": 18, "u": [1, 2], "ucsd": 6, "udac": 2, "uint8": [7, 8, 10, 18], "ukrainian": 6, "unaccept": 1, "underli": [16, 18], "underneath": 7, "understand": [4, 6, 18], "uniform": [8, 9], "uniformli": 9, "uninterrupt": [7, 18], "union": 10, "unittest": 2, "unlock": 7, "unoffici": 8, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [8, 18], "updat": 10, "upgrad": 2, "upper": [6, 9], "uppercas": 16, "url": 7, "us": [1, 2, 3, 6, 8, 10, 11, 12, 13, 14, 15, 18], "usabl": 18, "usag": [13, 17], "use_polygon": [6, 10, 16], "useabl": 18, "user": [4, 7, 11], "utf": 18, "util": 17, "v1": 14, "v3": [8, 14, 18], "valid": 16, "valu": [2, 7, 9, 18], "valuabl": 4, "variabl": 13, "varieti": 6, "veri": 8, "version": [1, 2, 3, 17, 18], "vgg": 8, "vgg16": 14, "vgg16_bn_r": 8, "via": 1, "vietnames": 6, "view": [4, 6], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 6, 8], "visiondataset": 6, "visiontransform": 8, "visual": [3, 4, 15], "visualize_pag": 10, "vit_": 8, "vit_b": 8, "vitstr": [4, 8, 17], "vitstr_bas": [8, 18], "vitstr_smal": [8, 12, 17, 18], "viz": 3, "vocab": [12, 14, 16, 17, 18], "vocabulari": [6, 12, 14], "w": [7, 8, 9, 10], "w3": 18, "wa": 1, "wai": [1, 4, 16], "want": [2, 17, 18], "warmup": 18, "wasn": 2, "we": [1, 2, 3, 4, 7, 9, 12, 14, 16, 17, 18], "weasyprint": 7, "web": [2, 7], "websit": 6, "welcom": 1, "well": [1, 17], "were": [1, 7, 18], "what": 1, "when": [1, 2, 8], "whenev": 2, "where": [2, 7, 9, 10], "whether": [2, 6, 7, 9, 10, 16, 18], "which": [1, 8, 13, 15, 16, 18], "whichev": 3, "while": [9, 18], "why": 1, "width": [7, 9], "wiki": 1, "wildreceipt": [4, 6, 16], "window": [8, 10], "wish": 2, "within": 1, "without": [1, 6, 8], "wonder": 2, "word": [4, 6, 8, 10, 18], "word_1_1": 18, "word_1_2": 18, "word_1_3": 18, "wordgener": [6, 16], "words_onli": 10, "work": [12, 13, 18], "workflow": 2, "worklow": 2, "world": [10, 18], "worth": 8, "wrap": 18, "wrapper": [6, 9], "write": 13, "written": [1, 7], "www": [1, 7, 18], "x": [7, 9, 10], "x_ascend": 18, "x_descend": 18, "x_i": 10, "x_size": 18, "x_wconf": 18, "xhtml": 18, "xmax": 7, "xmin": 7, "xml": 18, "xml_bytes_str": 18, "xml_element": 18, "xml_output": 18, "xmln": 18, "y": 10, "y_i": 10, "y_j": 10, "yet": 15, "ymax": 7, "ymin": 7, "yolov8": 15, "you": [2, 3, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18], "your": [2, 4, 7, 10, 18], "yoursit": 7, "zero": [9, 10], "zoo": 12, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 6, 
"\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 6, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 6, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 6, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 6, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 6, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 6, "\u00e4\u00f6\u00e4\u00f6": 6, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 6, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 6, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 6, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 6, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 6, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 6, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 6, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 6, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 6, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 6, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 6, 
"\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 6, "\u067e\u0686\u06a2\u06a4\u06af": 6, "\u0905": 6, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 6, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 6, "\u0950": 6, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 6, "\u09bd": 6, "\u09ce": 6, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 6}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 18, "approach": 18, "architectur": 18, "arg": [6, 7, 8, 9, 10], "artefact": 7, "artefactdetect": 15, "attribut": 1, "avail": [15, 16, 18], "aw": 13, "ban": 1, "block": 7, "bug": 2, "changelog": 0, "choos": [16, 18], "classif": [8, 12, 14], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 14, "compos": 9, "conda": 3, "conduct": 1, "connect": 2, "continu": 2, "contrib": 5, "contribut": [2, 5, 15], "contributor": 1, "convent": 14, "correct": 1, "coven": 1, "custom": [6, 12], "data": 16, "dataload": 6, "dataset": [4, 6, 16], "detect": [4, 8, 14, 16, 18], "develop": 2, "do": 18, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 11], "document": [2, 4, 7], "end": 18, "enforc": 1, "evalu": 10, "export": 17, "factori": 8, "featur": [2, 4], "feedback": 2, "file": 7, "from": 14, "gener": [6, 16], "git": 3, "guidelin": 1, "half": 17, "hub": 14, "huggingfac": 14, "i": 18, "infer": 17, "instal": [2, 3], "integr": [2, 15], "io": 7, "lambda": 13, "let": 2, "line": 7, "linux": 3, "load": [12, 14, 16], "loader": 6, "main": 4, "mode": 2, "model": [4, 8, 12, 14, 17, 18], "modifi": 2, "modul": [5, 15], "name": 14, "notebook": 11, "object": 16, "ocr": [16, 18], "onli": 3, "onnx": 17, "optim": 17, "option": 18, "orient": 12, "our": 1, "output": 18, "own": [12, 16], "packag": 3, "page": 7, "perman": 1, "pipelin": 15, "pledg": 1, "precis": 17, "predictor": 18, "prepar": 17, "prerequisit": 3, "pretrain": 14, "push": 14, "python": 3, "qualiti": 2, "question": 2, "read": 7, "readi": 16, "recognit": [4, 8, 14, 16, 18], "report": 2, "request": 2, "respons": 1, "return": [6, 7, 8, 10], "right": 18, "scope": 1, "share": 14, "should": 18, "stage": 18, "standard": 1, "structur": [2, 7], "style": 2, "support": [4, 5, 6, 9], "synthet": [6, 16], "task": 10, "temporari": 1, "test": 2, "text": [4, 18], "train": 12, "transform": 9, "two": 18, "unit": 2, "us": [16, 17], "util": 10, "v0": 0, "verif": 2, "via": 
3, "visual": 10, "vocab": 6, "warn": 1, "what": 18, "word": 7, "your": [12, 14, 15, 16, 17], "zoo": [4, 8]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[13, null]], "Advanced options": [[18, "advanced-options"]], "Args:": [[6, "args"], [6, "id4"], [6, "id7"], [6, "id10"], [6, "id13"], [6, "id16"], [6, "id19"], [6, "id22"], [6, "id25"], [6, "id29"], [6, "id32"], [6, "id37"], [6, "id40"], [6, "id46"], [6, "id49"], [6, "id50"], [6, "id51"], [6, "id54"], [6, "id57"], [6, "id60"], [6, "id61"], [7, "args"], [7, "id2"], [7, "id3"], [7, "id4"], [7, "id5"], [7, "id6"], [7, "id7"], [7, "id10"], [7, "id12"], [7, "id14"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id28"], [8, "args"], [8, "id3"], [8, "id8"], [8, "id13"], [8, "id17"], [8, "id21"], [8, "id26"], [8, "id31"], [8, "id36"], [8, "id41"], [8, "id46"], [8, "id50"], [8, "id54"], [8, "id59"], [8, "id63"], [8, "id68"], [8, "id73"], [8, "id77"], [8, "id81"], [8, "id85"], [8, "id90"], [8, "id95"], [8, "id99"], [8, "id104"], [8, "id109"], [8, "id114"], [8, "id119"], [8, "id123"], [8, "id127"], [8, "id132"], [8, "id137"], [8, "id142"], [8, "id146"], [8, "id150"], [8, "id155"], [8, "id159"], [8, "id163"], [8, "id167"], [8, "id169"], [8, "id171"], [8, "id173"], [9, "args"], [9, "id1"], [9, "id2"], [9, "id3"], [9, "id4"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"], [9, "id12"], [9, "id13"], [9, "id14"], [9, "id15"], [9, "id16"], [9, "id17"], [9, "id18"], [9, "id19"], [10, "args"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"]], "Artefact": [[7, "artefact"]], "ArtefactDetection": [[15, "artefactdetection"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[16, "available-datasets"]], "Available architectures": [[18, "available-architectures"], [18, "id1"], [18, "id2"]], "Available contribution modules": [[15, "available-contribution-modules"]], "Block": [[7, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[16, null]], "Choosing the right model": [[18, null]], "Classification": [[14, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[9, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[6, "custom-dataset-loader"]], "Custom orientation classification models": [[12, "custom-orientation-classification-models"]], "Data Loading": [[16, "data-loading"]], "Dataloader": [[6, "dataloader"]], "Detection": [[14, "detection"], [16, "detection"]], "Detection predictors": [[18, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[7, "document"]], "Document structure": [[7, "document-structure"]], "End-to-End OCR": [[18, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[17, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": 
[[2, "feedback"]], "File reading": [[7, "file-reading"]], "Half-precision": [[17, "half-precision"]], "Installation": [[3, null]], "Integrate contributions into your pipeline": [[15, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[7, "line"]], "Loading from Huggingface Hub": [[14, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[12, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[12, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[17, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[14, "naming-conventions"]], "OCR": [[16, "ocr"]], "Object Detection": [[16, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[7, "page"]], "Preparing your model for inference": [[17, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[14, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[14, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[14, "recognition"], [16, "recognition"]], "Recognition predictors": [[18, "recognition-predictors"]], "Returns:": [[6, "returns"], [7, "returns"], [7, "id11"], [7, "id13"], [7, "id15"], [7, "id19"], [7, "id23"], [7, "id27"], [7, "id31"], [8, "returns"], [8, "id6"], [8, "id11"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id29"], [8, "id34"], [8, "id39"], [8, "id44"], [8, "id49"], [8, "id53"], [8, "id57"], [8, "id62"], [8, "id66"], [8, "id71"], [8, "id76"], [8, "id80"], [8, "id84"], [8, "id88"], [8, "id93"], [8, "id98"], [8, "id102"], [8, "id107"], [8, "id112"], [8, "id117"], [8, "id122"], [8, "id126"], [8, "id130"], [8, "id135"], [8, "id140"], [8, "id145"], [8, "id149"], [8, "id153"], [8, "id158"], [8, "id162"], [8, "id166"], [8, "id168"], [8, "id170"], [8, "id172"], [10, "returns"]], "Scope": [[1, "scope"]], "Share your model with the community": [[14, null]], "Supported Vocabs": [[6, "supported-vocabs"]], "Supported contribution modules": [[5, "supported-contribution-modules"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[9, "supported-transformations"]], "Synthetic dataset generator": [[6, "synthetic-dataset-generator"], [16, "synthetic-dataset-generator"]], "Task evaluation": [[10, "task-evaluation"]], "Text Detection": [[18, "text-detection"]], "Text Recognition": [[18, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[12, null]], "Two-stage approaches": [[18, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[16, "use-your-own-datasets"]], "Using your ONNX exported model": [[17, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[3, "via-conda-only-for-linux"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[10, "visualization"]], "What should I do with the output?": [[18, "what-should-i-do-with-the-output"]], "Word": [[7, "word"]], "docTR Notebooks": [[11, null]], "docTR Vocabs": [[6, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.contrib": [[5, null]], "doctr.datasets": [[6, null], [6, "datasets"]], "doctr.io": [[7, null]], "doctr.models": [[8, null]], 
"doctr.models.classification": [[8, "doctr-models-classification"]], "doctr.models.detection": [[8, "doctr-models-detection"]], "doctr.models.factory": [[8, "doctr-models-factory"]], "doctr.models.recognition": [[8, "doctr-models-recognition"]], "doctr.models.zoo": [[8, "doctr-models-zoo"]], "doctr.transforms": [[9, null]], "doctr.utils": [[10, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[7, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[7, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[9, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[6, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[9, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[9, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[6, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[8, 
"doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[6, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[8, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[8, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[7, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[8, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[6, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[6, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[7, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[7, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[6, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[6, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[9, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[9, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[6, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[6, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[6, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[6, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[6, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[8, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[9, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[7, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[8, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[6, 
"doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[9, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[8, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[6, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[9, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[7, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[8, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[9, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[9, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[9, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[9, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[9, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[9, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[9, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[9, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[9, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[9, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[9, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[9, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[7, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[7, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[7, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[7, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[6, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[9, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[8, 
"doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[7, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[7, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[6, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[10, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[10, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[10, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[10, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[6, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[6, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[6, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[9, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[10, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[10, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[10, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[10, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[10, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[8, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[8, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[6, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[7, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[6, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[6, 0, 1, "", "CORD"], [6, 0, 1, "", "CharacterGenerator"], [6, 0, 1, "", "DetectionDataset"], [6, 0, 1, "", "DocArtefacts"], [6, 0, 1, "", "FUNSD"], [6, 0, 1, "", "IC03"], [6, 0, 1, "", "IC13"], [6, 0, 1, "", "IIIT5K"], [6, 0, 1, "", "IIITHWS"], [6, 0, 1, "", 
"IMGUR5K"], [6, 0, 1, "", "MJSynth"], [6, 0, 1, "", "OCRDataset"], [6, 0, 1, "", "RecognitionDataset"], [6, 0, 1, "", "SROIE"], [6, 0, 1, "", "SVHN"], [6, 0, 1, "", "SVT"], [6, 0, 1, "", "SynthText"], [6, 0, 1, "", "WILDRECEIPT"], [6, 0, 1, "", "WordGenerator"], [6, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[6, 0, 1, "", "DataLoader"]], "doctr.io": [[7, 0, 1, "", "Artefact"], [7, 0, 1, "", "Block"], [7, 0, 1, "", "Document"], [7, 0, 1, "", "DocumentFile"], [7, 0, 1, "", "Line"], [7, 0, 1, "", "Page"], [7, 0, 1, "", "Word"], [7, 1, 1, "", "decode_img_as_tensor"], [7, 1, 1, "", "read_html"], [7, 1, 1, "", "read_img_as_numpy"], [7, 1, 1, "", "read_img_as_tensor"], [7, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[7, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[7, 2, 1, "", "from_images"], [7, 2, 1, "", "from_pdf"], [7, 2, 1, "", "from_url"]], "doctr.io.Page": [[7, 2, 1, "", "show"]], "doctr.models": [[8, 1, 1, "", "kie_predictor"], [8, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[8, 1, 1, "", "crop_orientation_predictor"], [8, 1, 1, "", "magc_resnet31"], [8, 1, 1, "", "mobilenet_v3_large"], [8, 1, 1, "", "mobilenet_v3_large_r"], [8, 1, 1, "", "mobilenet_v3_small"], [8, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [8, 1, 1, "", "mobilenet_v3_small_page_orientation"], [8, 1, 1, "", "mobilenet_v3_small_r"], [8, 1, 1, "", "page_orientation_predictor"], [8, 1, 1, "", "resnet18"], [8, 1, 1, "", "resnet31"], [8, 1, 1, "", "resnet34"], [8, 1, 1, "", "resnet50"], [8, 1, 1, "", "textnet_base"], [8, 1, 1, "", "textnet_small"], [8, 1, 1, "", "textnet_tiny"], [8, 1, 1, "", "vgg16_bn_r"], [8, 1, 1, "", "vit_b"], [8, 1, 1, "", "vit_s"]], "doctr.models.detection": [[8, 1, 1, "", "db_mobilenet_v3_large"], [8, 1, 1, "", "db_resnet50"], [8, 1, 1, "", "detection_predictor"], [8, 1, 1, "", "fast_base"], [8, 1, 1, "", "fast_small"], [8, 1, 1, "", "fast_tiny"], [8, 1, 1, "", "linknet_resnet18"], [8, 1, 1, "", "linknet_resnet34"], [8, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[8, 1, 1, "", "from_hub"], [8, 1, 1, "", "login_to_hub"], [8, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[8, 1, 1, "", "crnn_mobilenet_v3_large"], [8, 1, 1, "", "crnn_mobilenet_v3_small"], [8, 1, 1, "", "crnn_vgg16_bn"], [8, 1, 1, "", "master"], [8, 1, 1, "", "parseq"], [8, 1, 1, "", "recognition_predictor"], [8, 1, 1, "", "sar_resnet31"], [8, 1, 1, "", "vitstr_base"], [8, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[9, 0, 1, "", "ChannelShuffle"], [9, 0, 1, "", "ColorInversion"], [9, 0, 1, "", "Compose"], [9, 0, 1, "", "GaussianBlur"], [9, 0, 1, "", "GaussianNoise"], [9, 0, 1, "", "LambdaTransformation"], [9, 0, 1, "", "Normalize"], [9, 0, 1, "", "OneOf"], [9, 0, 1, "", "RandomApply"], [9, 0, 1, "", "RandomBrightness"], [9, 0, 1, "", "RandomContrast"], [9, 0, 1, "", "RandomCrop"], [9, 0, 1, "", "RandomGamma"], [9, 0, 1, "", "RandomHorizontalFlip"], [9, 0, 1, "", "RandomHue"], [9, 0, 1, "", "RandomJpegQuality"], [9, 0, 1, "", "RandomResize"], [9, 0, 1, "", "RandomRotate"], [9, 0, 1, "", "RandomSaturation"], [9, 0, 1, "", "RandomShadow"], [9, 0, 1, "", "Resize"], [9, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[10, 0, 1, "", "DetectionMetric"], [10, 0, 1, "", "LocalizationConfusion"], [10, 0, 1, "", "OCRMetric"], [10, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": 
[[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.visualization": [[10, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 7, 8, 10, 14, 17], "0": [1, 3, 6, 9, 10, 12, 15, 16, 18], "00": 18, "01": 18, "0123456789": 6, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 6, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 6, "02562": 8, "03": 18, "035": 18, "0361328125": 18, "04": 18, "05": 18, "06": 18, "06640625": 18, "07": 18, "08": [9, 18], "09": 18, "0966796875": 18, "1": [6, 7, 8, 9, 10, 12, 16, 18], "10": [6, 10, 18], "100": [6, 9, 10, 16, 18], "1000": 18, "101": 6, "1024": [8, 12, 18], "104": 6, "106": 6, "108": 6, "1095": 16, "11": 18, "110": 10, "1107": 16, "114": 6, "115": 6, "1156": 16, "116": 6, "118": 6, "11800h": 18, "11th": 18, "12": 18, "120": 6, "123": 6, "126": 6, "1268": 16, "128": [8, 12, 17, 18], "13": 18, "130": 6, "13068": 16, "131": 6, "1337891": 16, "1357421875": 18, "1396484375": 18, "14": 18, "1420": 18, "14470v1": 6, "149": 16, "15": 18, "150": [10, 18], "1552": 18, "16": [8, 17, 18], "1630859375": 18, "1684": 18, "16x16": 8, "17": 18, "1778": 18, "1782": 18, "18": [8, 18], "185546875": 18, "1900": 18, "1910": 8, "19342": 16, "19370": 16, "195": 6, "19598": 16, "199": 18, "1999": 18, "2": [3, 4, 6, 7, 9, 15, 18], "20": 18, "200": 10, "2000": 16, "2003": [4, 6], "2012": 6, "2013": [4, 6], "2015": 6, "2019": 4, "207901": 16, "21": 18, "2103": 6, "2186": 16, "21888": 16, "22": 18, "224": [8, 9], "225": 9, "22672": 16, "229": [9, 16], "23": 18, "233": 16, "236": 6, "24": 18, "246": 16, "249": 16, "25": 18, "2504": 18, "255": [7, 8, 9, 10, 18], "256": 8, "257": 16, "26": 18, "26032": 16, "264": 12, "27": 18, "2700": 16, "2710": 18, "2749": 12, "28": 18, "287": 12, "29": 18, "296": 12, "299": 12, "2d": 18, "3": [3, 4, 7, 8, 9, 10, 17, 18], "30": 18, "300": 16, "3000": 16, "301": 12, "30595": 18, "30ghz": 18, "31": 8, "32": [6, 8, 9, 12, 16, 17, 18], "3232421875": 18, "33": [9, 18], "33402": 16, "33608": 16, "34": [8, 18], "340": 18, "3456": 18, "3515625": 18, "36": 18, "360": 16, "37": [6, 18], "38": 18, "39": 18, "4": [8, 9, 10, 18], "40": 18, "406": 9, "41": 18, "42": 18, "43": 18, "44": 18, "45": 18, "456": 9, "46": 18, "47": 18, "472": 16, "48": [6, 18], "485": 9, "49": 18, "49377": 16, "5": [6, 9, 10, 15, 18], "50": [8, 16, 18], "51": 18, "51171875": 18, "512": 8, "52": [6, 18], "529": 18, "53": 18, "54": 18, "540": 18, "5478515625": 18, "55": 18, "56": 18, "57": 18, "58": [6, 18], "580": 18, "5810546875": 18, "583": 18, "59": 18, "597": 18, "5k": [4, 6], "5m": 18, "6": [9, 18], "60": 9, "600": [8, 10, 18], "61": 18, "62": 18, "626": 16, "63": 18, "64": [8, 9, 18], "641": 18, "647": 16, "65": 18, "66": 18, "67": 18, "68": 18, "69": 18, "693": 12, "694": 12, "695": 12, "6m": 18, "7": 18, "70": [6, 10, 18], "707470": 16, "71": [6, 18], "7100000": 16, "7141797": 16, "7149": 16, "72": 18, "72dpi": 7, "73": 18, "73257": 16, "74": 18, "75": [9, 18], "7581382": 16, "76": 18, "77": 18, "772": 12, "772875": 16, "78": 18, "785": 12, "79": 18, "793533": 16, "796": 16, "798": 12, "7m": 18, "8": [8, 9, 18], "80": 18, "800": [8, 10, 16, 18], "81": 18, "82": 18, "83": 18, "84": 18, "849": 16, "85": 18, "8564453125": 18, "857": 18, 
"85875": 16, "86": 18, "8603515625": 18, "87": 18, "8707": 16, "88": 18, "89": 18, "9": [3, 9, 18], "90": 18, "90k": 6, "90kdict32px": 6, "91": 18, "914085328578949": 18, "92": 18, "93": 18, "94": [6, 18], "95": [10, 18], "9578408598899841": 18, "96": 18, "97": 18, "98": 18, "99": 18, "9949972033500671": 18, "A": [1, 2, 4, 6, 7, 8, 11, 17], "As": 2, "Be": 18, "Being": 1, "By": 13, "For": [1, 2, 3, 12, 18], "If": [2, 7, 8, 12, 18], "In": [2, 6, 16], "It": [9, 14, 15, 17], "Its": [4, 8], "No": [1, 18], "Of": 6, "Or": [15, 17], "The": [1, 2, 6, 7, 10, 13, 15, 16, 17, 18], "Then": 8, "To": [2, 3, 13, 14, 15, 17, 18], "_": [1, 6, 8], "__call__": 18, "_build": 2, "_i": 10, "ab": 6, "abc": 17, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 6, "abdef": [6, 16], "abl": [16, 18], "about": [1, 16, 18], "abov": 18, "abstractdataset": 6, "abus": 1, "accept": 1, "access": [4, 7, 16, 18], "account": [1, 14], "accur": 18, "accuraci": 10, "achiev": 17, "act": 1, "action": 1, "activ": 4, "ad": [2, 8, 9], "adapt": 1, "add": [9, 10, 14, 18], "add_hook": 18, "add_label": 10, "addit": [2, 3, 7, 15, 18], "addition": [2, 18], "address": [1, 7], "adjust": 9, "advanc": 1, "advantag": 17, "advis": 2, "aesthet": [4, 6], "affect": 1, "after": [14, 18], "ag": 1, "again": 8, "aggreg": [10, 16], "aggress": 1, "align": [1, 7, 9], "all": [1, 2, 5, 6, 7, 9, 10, 15, 16, 18], "allow": [1, 17], "along": 18, "alreadi": [2, 17], "also": [1, 8, 14, 15, 16, 18], "alwai": 16, "an": [1, 2, 4, 6, 7, 8, 10, 15, 17, 18], "analysi": [7, 15], "ancient_greek": 6, "angl": [7, 9], "ani": [1, 6, 7, 8, 9, 10, 17, 18], "annot": 6, "anot": 16, "anoth": [8, 12, 16], "answer": 1, "anyascii": 10, "anyon": 4, "anyth": 15, "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 6, 9], "applic": [4, 8], "appoint": 1, "appreci": 14, "appropri": [1, 2, 18], "ar": [1, 2, 3, 5, 6, 7, 9, 10, 11, 15, 16, 18], "arab": 6, "arabic_diacrit": 6, "arabic_lett": 6, "arabic_punctu": 6, "arbitrarili": [4, 8], "arch": [8, 14], "architectur": [4, 8, 14, 15], "area": 18, "argument": [6, 7, 8, 10, 12, 18], "around": 1, "arrai": [7, 9, 10], "art": [4, 15], "artefact": [10, 15, 18], "artefact_typ": 7, "artifici": [4, 6], "arxiv": [6, 8], "asarrai": 10, "ascii_lett": 6, "aspect": [4, 8, 9, 18], "assess": 10, "assign": 10, "associ": 7, "assum": 8, "assume_straight_pag": [8, 12, 18], "astyp": [8, 10, 18], "attack": 1, "attend": [4, 8], "attent": [1, 8], "autom": 4, "automat": 18, "autoregress": [4, 8], "avail": [1, 4, 5, 9], "averag": [9, 18], "avoid": [1, 3], "aw": [4, 18], "awar": 18, "azur": 18, "b": [8, 10, 18], "b_j": 10, "back": 2, "backbon": 8, "backend": 18, "background": 16, "bangla": 6, "bar": 15, "bar_cod": 16, "base": [4, 8, 15], "baselin": [4, 8, 18], "batch": [6, 8, 9, 15, 16, 18], "batch_siz": [6, 12, 15, 16, 17], "bblanchon": 3, "bbox": 18, "becaus": 13, "been": [2, 10, 16, 18], "befor": [6, 8, 9, 18], "begin": 10, "behavior": [1, 18], "being": [10, 18], "belong": 18, "benchmark": 18, "best": 1, "better": [11, 18], "between": [9, 10, 18], "bgr": 7, "bilinear": 9, "bin_thresh": 18, "binar": [4, 8, 18], "binari": [7, 17, 18], "bit": 17, "block": [10, 18], "block_1_1": 18, "blur": 9, "bmvc": 6, "bn": 14, "bodi": [1, 18], "bool": [6, 7, 8, 9, 10], "boolean": [8, 18], "both": [4, 6, 9, 16, 18], "bottom": [8, 18], "bound": [6, 7, 8, 9, 10, 15, 16, 18], "box": [6, 7, 8, 9, 10, 15, 16, 18], "box_thresh": 18, "bright": 9, "browser": [2, 4], "build": [2, 3, 17], "built": 2, "byte": [7, 18], "c": [3, 7, 10], "c_j": 10, "cach": [2, 6, 13], 
"cache_sampl": 6, "call": 17, "callabl": [6, 9], "can": [2, 3, 12, 13, 14, 15, 16, 18], "capabl": [2, 11, 18], "case": [6, 10], "cf": 18, "cfg": 18, "challeng": 6, "challenge2_test_task12_imag": 6, "challenge2_test_task1_gt": 6, "challenge2_training_task12_imag": 6, "challenge2_training_task1_gt": 6, "chang": [13, 18], "channel": [1, 2, 7, 9], "channel_prior": 3, "channelshuffl": 9, "charact": [4, 6, 7, 10, 16, 18], "charactergener": [6, 16], "characterist": 1, "charg": 18, "charset": 18, "chart": 7, "check": [2, 14, 18], "checkpoint": 8, "chip": 3, "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 6, 7, 9, 10, 18], "class_nam": 12, "classif": [16, 18], "classmethod": 7, "clear": 2, "clone": 3, "close": 2, "co": 14, "code": [4, 7, 15], "codecov": 2, "colab": 11, "collate_fn": 6, "collect": [7, 15], "color": 9, "colorinvers": 9, "column": 7, "com": [1, 3, 7, 8, 14], "combin": 18, "command": [2, 15], "comment": 1, "commit": 1, "common": [1, 9, 10, 17], "commun": 1, "compar": 4, "comparison": [10, 18], "competit": 6, "compil": [11, 18], "complaint": 1, "complementari": 10, "complet": 2, "compon": 18, "compos": [6, 18], "comprehens": 18, "comput": [6, 10, 17, 18], "conf_threshold": 15, "confid": [7, 18], "config": [3, 8], "configur": 8, "confus": 10, "consecut": [9, 18], "consequ": 1, "consid": [1, 2, 6, 7, 10, 18], "consist": 18, "consolid": [4, 6], "constant": 9, "construct": 1, "contact": 1, "contain": [5, 6, 11, 16, 18], "content": [6, 7, 18], "context": 8, "contib": 3, "continu": 1, "contrast": 9, "contrast_factor": 9, "contrib": [3, 15], "contribut": 1, "contributor": 2, "convers": 7, "convert": [7, 9], "convolut": 8, "coordin": [7, 18], "cord": [4, 6, 16, 18], "core": [10, 18], "corner": 18, "correct": 9, "correspond": [3, 7, 9, 18], "could": [1, 15], "counterpart": 10, "cover": 2, "coverag": 2, "cpu": [4, 12, 17], "creat": 14, "crnn": [4, 8, 14], "crnn_mobilenet_v3_larg": [8, 14, 18], "crnn_mobilenet_v3_smal": [8, 17, 18], "crnn_vgg16_bn": [8, 12, 14, 18], "crop": [7, 8, 9, 12, 16, 18], "crop_orient": [7, 18], "crop_orientation_predictor": [8, 12], "crop_param": 12, "cuda": 17, "currenc": 6, "current": [2, 12, 18], "custom": [14, 15, 17, 18], "custom_crop_orientation_model": 12, "custom_page_orientation_model": 12, "customhook": 18, "cvit": 4, "czczup": 8, "czech": 6, "d": [6, 16], "danish": 6, "data": [4, 6, 7, 9, 10, 12, 14], "dataload": 16, "dataset": [8, 12, 18], "dataset_info": 6, "date": [12, 18], "db": 14, "db_mobilenet_v3_larg": [8, 14, 18], "db_resnet34": 18, "db_resnet50": [8, 12, 14, 18], "dbnet": [4, 8], "deal": [11, 18], "decis": 1, "decod": 7, "decode_img_as_tensor": 7, "dedic": 17, "deem": 1, "deep": [8, 18], "def": 18, "default": [3, 7, 12, 13, 18], "defer": 16, "defin": [10, 17], "degre": [7, 9, 18], "degress": 7, "delet": 2, "delimit": 18, "delta": 9, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4, 18], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": 8, "descript": 11, "design": 9, "desir": 7, "det_arch": [8, 12, 14, 17], "det_b": 18, "det_model": [12, 14, 17], "det_param": 12, "det_predictor": [12, 18], "detail": [12, 18], "detect": [6, 7, 10, 11, 12, 15], "detect_languag": 8, "detect_orient": [8, 12, 18], "detection_predictor": [8, 18], "detection_task": [6, 16], "detectiondataset": [6, 16], "detectionmetr": 10, "detectionpredictor": [8, 12], "detector": [4, 8, 15], "deterior": 8, "determin": 1, "dev": [2, 13], "develop": 3, "deviat": 9, "devic": 17, "dict": [7, 10, 18], "dictionari": [7, 10], "differ": 1, "differenti": [4, 8], "digit": [4, 6, 16], 
"dimens": [7, 10, 18], "dimension": 9, "direct": 6, "directli": [14, 18], "directori": [2, 13], "disabl": [1, 13, 18], "disable_crop_orient": 18, "disable_page_orient": 18, "disclaim": 18, "discuss": 2, "disparag": 1, "displai": [7, 10], "display_artefact": 10, "distribut": 9, "div": 18, "divers": 1, "divid": 7, "do": [2, 3, 8], "doc": [2, 7, 15, 17, 18], "docartefact": [6, 16], "docstr": 2, "doctr": [3, 12, 13, 14, 15, 16, 17, 18], "doctr_cache_dir": 13, "doctr_multiprocessing_dis": 13, "document": [6, 8, 10, 11, 12, 15, 16, 17, 18], "documentbuild": 18, "documentfil": [7, 12, 14, 15, 17], "doesn": 17, "don": [12, 18], "done": 9, "download": [6, 16], "downsiz": 8, "draw": 9, "drop": 6, "drop_last": 6, "dtype": [7, 8, 9, 10, 17], "dual": [4, 6], "dummi": 14, "dummy_img": 18, "dummy_input": 17, "dure": 1, "dutch": 6, "dynam": [6, 15], "dynamic_seq_length": 6, "e": [1, 2, 3, 7, 8], "each": [4, 6, 7, 8, 9, 10, 16, 18], "eas": 2, "easi": [4, 10, 14, 17], "easili": [7, 10, 12, 14, 16, 18], "econom": 1, "edit": 1, "educ": 1, "effect": 18, "effici": [2, 4, 6, 8], "either": [10, 18], "element": [6, 7, 8, 18], "els": [2, 15], "email": 1, "empathi": 1, "en": 18, "enabl": [6, 7], "enclos": 7, "encod": [4, 6, 7, 8, 18], "encode_sequ": 6, "encount": 2, "encrypt": 7, "end": [4, 6, 8, 10], "english": [6, 16], "enough": [2, 18], "ensur": 2, "entri": 6, "environ": [1, 13], "eo": 6, "equiv": 18, "estim": 8, "etc": [7, 15], "ethnic": 1, "evalu": [16, 18], "event": 1, "everyon": 1, "everyth": [2, 18], "exact": [10, 18], "exampl": [1, 2, 4, 6, 8, 14, 18], "exchang": 17, "execut": 18, "exist": 14, "expand": 9, "expect": [7, 9, 10], "experi": 1, "explan": [1, 18], "explicit": 1, "exploit": [4, 8], "export": [7, 8, 10, 11, 15, 18], "export_as_straight_box": [8, 18], "export_as_xml": 18, "export_model_to_onnx": 17, "express": [1, 9], "extens": 7, "extern": [1, 16], "extract": [4, 6], "extractor": 8, "f_": 10, "f_a": 10, "factor": 9, "fair": 1, "fairli": 1, "fals": [6, 7, 8, 9, 10, 12, 18], "faq": 1, "fascan": 14, "fast": [4, 6, 8], "fast_bas": [8, 18], "fast_smal": [8, 18], "fast_tini": [8, 18], "faster": [4, 8, 17], "fasterrcnn_mobilenet_v3_large_fpn": 8, "favorit": 18, "featur": [3, 8, 10, 11, 12, 15], "feedback": 1, "feel": [2, 14], "felix92": 14, "few": [17, 18], "figsiz": 10, "figur": [10, 15], "file": [2, 6], "final": 8, "find": [2, 16], "finnish": 6, "first": [2, 6], "firsthand": 6, "fit": [8, 18], "flag": 18, "flip": 9, "float": [7, 9, 10, 17], "float32": [7, 8, 9, 17], "fn": 9, "focu": 14, "focus": [1, 6], "folder": 6, "follow": [1, 2, 3, 6, 9, 10, 12, 13, 14, 15, 18], "font": 6, "font_famili": 6, "foral": 10, "forc": 2, "forg": 3, "form": [4, 6, 18], "format": [7, 10, 12, 16, 17, 18], "forpost": [4, 6], "forum": 2, "fp16": 17, "frac": 10, "framework": [3, 14, 16, 18], "free": [1, 2, 14], "french": [6, 12, 14, 18], "friendli": 4, "from": [1, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18], "from_hub": [8, 14], "from_imag": [7, 14, 15, 17], "from_pdf": 7, "from_url": 7, "full": [6, 10, 18], "function": [6, 9, 10, 15], "funsd": [4, 6, 16, 18], "further": 16, "futur": 6, "g": [7, 8], "g_": 10, "g_x": 10, "gamma": 9, "gaussian": 9, "gaussianblur": 9, "gaussiannois": 9, "gen": 18, "gender": 1, "gener": [2, 4, 7, 8], "generic_cyrillic_lett": 6, "geometri": [4, 7, 18], "geq": 10, "german": [6, 12, 14], "get": [17, 18], "git": 14, "github": [2, 3, 8, 14], "give": [1, 15], "given": [6, 7, 9, 10, 18], "global": 8, "go": 18, "good": 17, "googl": 2, "googlevis": 4, "gpu": [4, 15, 17], "gracefulli": 1, "graph": [4, 6, 7], 
"grayscal": 9, "ground": 10, "groung": 10, "group": [4, 18], "gt": 10, "gt_box": 10, "gt_label": 10, "guid": 2, "guidanc": 16, "gvision": 18, "h": [7, 8, 9], "h_": 10, "ha": [2, 6, 10, 16], "handl": [11, 16, 18], "handwrit": 6, "handwritten": 16, "harass": 1, "hardwar": 18, "harm": 1, "hat": 10, "have": [1, 2, 10, 12, 14, 16, 17, 18], "head": [8, 18], "healthi": 1, "hebrew": 6, "height": [7, 9], "hello": [10, 18], "help": 17, "here": [5, 9, 11, 15, 16, 18], "hf": 8, "hf_hub_download": 8, "high": 7, "higher": [3, 6, 18], "hindi": 6, "hindi_digit": 6, "hocr": 18, "hook": 18, "horizont": [7, 9, 18], "hous": 6, "how": [2, 11, 12, 14, 16], "howev": 16, "hsv": 9, "html": [1, 2, 3, 7, 18], "http": [1, 3, 6, 7, 8, 14, 18], "hub": 8, "hue": 9, "huggingfac": 8, "hw": 6, "i": [1, 2, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17], "i7": 18, "ic03": [4, 6, 16], "ic13": [4, 6, 16], "icdar": [4, 6], "icdar2019": 6, "id": 18, "ident": 1, "identifi": 4, "iiit": [4, 6], "iiit5k": [6, 16], "iiithw": [4, 6, 16], "imag": [4, 6, 7, 8, 9, 10, 14, 15, 16, 18], "imagenet": 8, "imageri": 1, "images_90k_norm": 6, "img": [6, 9, 16, 17], "img_cont": 7, "img_fold": [6, 16], "img_path": 7, "img_transform": 6, "imgur5k": [4, 6, 16], "imgur5k_annot": 6, "imlist": 6, "impact": 1, "implement": [6, 7, 8, 9, 10, 18], "import": [6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18], "improv": 8, "inappropri": 1, "incid": 1, "includ": [1, 6, 16, 17], "inclus": 1, "increas": 9, "independ": 9, "index": [2, 7], "indic": 10, "individu": 1, "infer": [4, 8, 9, 15, 18], "inform": [1, 2, 4, 6, 16], "input": [2, 7, 8, 9, 17, 18], "input_crop": 8, "input_pag": [8, 10, 18], "input_shap": 17, "input_tensor": 8, "inspir": [1, 9], "instal": [14, 15, 17], "instanc": [1, 18], "instanti": [8, 18], "instead": [6, 7, 8], "insult": 1, "int": [6, 7, 9], "int64": 10, "integ": 10, "integr": [4, 14, 16], "intel": 18, "interact": [1, 7, 10], "interfac": [14, 17], "interoper": 17, "interpol": 9, "interpret": [6, 7], "intersect": 10, "invert": 9, "investig": 1, "invis": 1, "involv": [1, 18], "io": [12, 14, 15, 17], "iou": 10, "iou_thresh": 10, "iou_threshold": 15, "irregular": [4, 8, 16], "isn": 6, "issu": [1, 2, 14], "italian": 6, "iter": [6, 9, 16, 18], "its": [7, 8, 9, 10, 16, 18], "itself": [8, 14], "j": 10, "job": 2, "join": 2, "jpeg": 9, "jpegqual": 9, "jpg": [6, 7, 14, 17], "json": [6, 16, 18], "json_output": 18, "jump": 2, "just": 1, "kei": [4, 6], "kera": [8, 17], "kernel": [4, 8, 9], "kernel_shap": 9, "keywoard": 8, "keyword": [6, 7, 8, 10], "kie": [8, 12], "kie_predictor": [8, 12], "kiepredictor": 8, "kind": 1, "know": [2, 17], "kwarg": [6, 7, 8, 10], "l": 10, "l_j": 10, "label": [6, 10, 15, 16], "label_fil": [6, 16], "label_fold": 6, "label_path": [6, 16], "labels_path": [6, 16], "ladder": 1, "lambda": 9, "lambdatransform": 9, "lang": 18, "languag": [1, 4, 6, 7, 8, 14, 18], "larg": [8, 14], "largest": 10, "last": [3, 6], "latenc": 8, "later": 2, "latest": 18, "latin": 6, "layer": 17, "layout": 18, "lead": 1, "leader": 1, "learn": [1, 4, 8, 17, 18], "least": 3, "left": [10, 18], "legacy_french": 6, "length": [6, 18], "less": [17, 18], "level": [1, 6, 10, 18], "leverag": 11, "lf": 14, "librari": [2, 3, 11, 12], "light": 4, "lightweight": 17, "like": 1, "limits_": 10, "line": [4, 8, 10, 18], "line_1_1": 18, "link": 12, "linknet": [4, 8], "linknet_resnet18": [8, 12, 17, 18], "linknet_resnet34": [8, 17, 18], "linknet_resnet50": [8, 18], "list": [6, 7, 9, 10, 14], "ll": 10, "load": [4, 6, 8, 15, 17], "load_state_dict": 12, "load_weight": 12, "loc_pr": 18, "local": [2, 4, 
6, 8, 10, 16, 18], "localis": 6, "localizationconfus": 10, "locat": [2, 7, 18], "login": 8, "login_to_hub": [8, 14], "logo": [7, 15, 16], "love": 14, "lower": [9, 10, 18], "m": [2, 10, 18], "m1": 3, "macbook": 3, "machin": 17, "made": 4, "magc_resnet31": 8, "mai": [1, 2], "mail": 1, "main": 11, "maintain": 4, "mainten": 2, "make": [1, 2, 10, 12, 13, 14, 17, 18], "mani": [16, 18], "manipul": 18, "map": [6, 8], "map_loc": 12, "master": [4, 8, 18], "match": [10, 18], "mathcal": 10, "matplotlib": [7, 10], "max": [6, 9, 10], "max_angl": 9, "max_area": 9, "max_char": [6, 16], "max_delta": 9, "max_gain": 9, "max_gamma": 9, "max_qual": 9, "max_ratio": 9, "maximum": [6, 9], "maxval": [8, 9], "mbox": 10, "mean": [9, 10, 12], "meaniou": 10, "meant": [7, 17], "measur": 18, "media": 1, "median": 8, "meet": 12, "member": 1, "memori": [13, 17], "mention": 18, "merg": 6, "messag": 2, "meta": 18, "metadata": 17, "metal": 3, "method": [7, 9, 18], "metric": [10, 18], "middl": 18, "might": [17, 18], "min": 9, "min_area": 9, "min_char": [6, 16], "min_gain": 9, "min_gamma": 9, "min_qual": 9, "min_ratio": 9, "min_val": 9, "minde": [1, 3, 4, 8], "minim": [2, 4], "minimalist": [4, 8], "minimum": [3, 6, 9, 10, 18], "minval": 9, "miss": 3, "mistak": 1, "mixed_float16": 17, "mixed_precis": 17, "mjsynth": [4, 6, 16], "mnt": 6, "mobilenet": [8, 14], "mobilenet_v3_larg": 8, "mobilenet_v3_large_r": 8, "mobilenet_v3_smal": [8, 12], "mobilenet_v3_small_crop_orient": [8, 12], "mobilenet_v3_small_page_orient": [8, 12], "mobilenet_v3_small_r": 8, "mobilenetv3": 8, "modal": [4, 6], "mode": 3, "model": [6, 10, 13, 15, 16], "model_nam": [8, 14, 17], "model_path": [15, 17], "moder": 1, "modif": 2, "modifi": [8, 13, 18], "modul": [3, 7, 8, 9, 10, 18], "more": [2, 16, 18], "most": 18, "mozilla": 1, "multi": [4, 8], "multilingu": [6, 14], "multipl": [6, 7, 9, 18], "multipli": 9, "multiprocess": 13, "my": 8, "my_awesome_model": 14, "my_hook": 18, "n": [6, 10], "name": [6, 8, 17, 18], "nation": 1, "natur": [1, 4, 6], "ndarrai": [6, 7, 9, 10], "necessari": [3, 12, 13], "need": [2, 3, 6, 10, 12, 13, 14, 15, 18], "neg": 9, "nest": 18, "network": [4, 6, 8, 17], "neural": [4, 6, 8, 17], "new": [2, 10], "next": [6, 16], "nois": 9, "noisi": [4, 6], "non": [4, 6, 7, 8, 9, 10], "none": [6, 7, 8, 9, 10, 18], "normal": [8, 9], "norwegian": 6, "note": [0, 2, 6, 8, 12, 14, 15, 17], "now": 2, "np": [8, 9, 10, 18], "num_output_channel": 9, "num_sampl": [6, 16], "number": [6, 9, 10, 18], "numpi": [7, 8, 10, 18], "o": 3, "obb": 15, "obj_detect": 14, "object": [6, 7, 10, 15, 18], "objectness_scor": [7, 18], "oblig": 1, "obtain": 18, "occupi": 17, "ocr": [4, 6, 8, 10, 14], "ocr_carea": 18, "ocr_db_crnn": 10, "ocr_lin": 18, "ocr_pag": 18, "ocr_par": 18, "ocr_predictor": [8, 12, 14, 17, 18], "ocrdataset": [6, 16], "ocrmetr": 10, "ocrpredictor": [8, 12], "ocrx_word": 18, "offens": 1, "offici": [1, 8], "offlin": 1, "offset": 9, "onc": 18, "one": [2, 6, 8, 9, 12, 14, 18], "oneof": 9, "ones": [6, 10], "onli": [2, 8, 9, 10, 12, 14, 16, 17, 18], "onlin": 1, "onnx": 15, "onnxruntim": [15, 17], "onnxtr": 17, "opac": 9, "opacity_rang": 9, "open": [1, 2, 14, 17], "opinion": 1, "optic": [4, 18], "optim": [4, 18], "option": [6, 8, 12], "order": [2, 6, 7, 9], "org": [1, 6, 8, 18], "organ": 7, "orient": [1, 7, 8, 11, 15, 18], "orientationpredictor": 8, "other": [1, 2], "otherwis": [1, 7, 10], "our": [2, 8, 18], "out": [2, 8, 9, 10, 18], "outpout": 18, "output": [7, 9, 17], "output_s": [7, 9], "outsid": 13, "over": [6, 10, 18], "overal": [1, 8], "overlai": 7, 
"overview": 15, "overwrit": 12, "overwritten": 14, "own": 4, "p": [9, 18], "packag": [2, 4, 10, 13, 15, 16, 17], "pad": [6, 8, 9, 18], "page": [3, 6, 8, 10, 12, 18], "page1": 7, "page2": 7, "page_1": 18, "page_idx": [7, 18], "page_orientation_predictor": [8, 12], "page_param": 12, "pair": 10, "paper": 8, "par_1_1": 18, "paragraph": 18, "paragraph_break": 18, "param": [9, 18], "paramet": [4, 7, 8, 17], "pars": [4, 6], "parseq": [4, 8, 14, 17, 18], "part": [6, 9, 18], "parti": 3, "partial": 18, "particip": 1, "pass": [6, 7, 8, 12, 18], "password": 7, "patch": [8, 10], "path": [6, 7, 15, 16, 17], "path_to_checkpoint": 12, "path_to_custom_model": 17, "path_to_pt": 12, "pattern": 1, "pdf": [7, 8, 11], "pdfpage": 7, "peopl": 1, "per": [9, 18], "perform": [4, 7, 8, 9, 10, 13, 17, 18], "period": 1, "permiss": 1, "permut": [4, 8], "persian_lett": 6, "person": [1, 16], "phase": 18, "photo": 16, "physic": [1, 7], "pick": 9, "pictur": 7, "pip": [2, 3, 15, 17], "pipelin": 18, "pixel": [7, 9, 18], "pleas": 2, "plot": 10, "plt": 10, "plug": 14, "plugin": 3, "png": 7, "point": 17, "polici": 13, "polish": 6, "polit": 1, "polygon": [6, 10, 18], "pool": 8, "portugues": 6, "posit": [1, 10], "possibl": [2, 10, 14, 18], "post": [1, 18], "postprocessor": 18, "potenti": 8, "power": 4, "ppageno": 18, "pre": [2, 8, 17], "precis": [10, 18], "pred": 10, "pred_box": 10, "pred_label": 10, "predefin": 16, "predict": [7, 8, 10, 18], "predictor": [4, 7, 8, 11, 12, 14, 17], "prefer": 16, "preinstal": 3, "preprocessor": [12, 18], "prerequisit": 14, "present": 11, "preserv": [8, 9, 18], "preserve_aspect_ratio": [7, 8, 9, 12, 18], "pretrain": [4, 8, 10, 12, 17, 18], "pretrained_backbon": [8, 12], "print": 18, "prior": 6, "privaci": 1, "privat": 1, "probabl": 9, "problem": 2, "procedur": 9, "process": [2, 4, 7, 12, 18], "processor": 18, "produc": [11, 18], "product": 17, "profession": 1, "project": [2, 16], "promptli": 1, "proper": 2, "properli": 6, "provid": [1, 2, 4, 14, 15, 16, 18], "public": [1, 4], "publicli": 18, "publish": 1, "pull": 14, "punctuat": 6, "pure": 6, "purpos": 2, "push_to_hf_hub": [8, 14], "py": 14, "pypdfium2": [3, 7], "pyplot": [7, 10], "python": [2, 15], "python3": 14, "pytorch": [3, 4, 8, 9, 12, 14, 17, 18], "q": 2, "qr": [7, 15], "qr_code": 16, "qualiti": 9, "question": 1, "quickli": 4, "quicktour": 11, "r": 18, "race": 1, "ramdisk": 6, "rand": [8, 9, 10, 17, 18], "random": [8, 9, 10, 18], "randomappli": 9, "randombright": 9, "randomcontrast": 9, "randomcrop": 9, "randomgamma": 9, "randomhorizontalflip": 9, "randomhu": 9, "randomjpegqu": 9, "randomli": 9, "randomres": 9, "randomrot": 9, "randomsatur": 9, "randomshadow": 9, "rang": 9, "rassi": 14, "ratio": [8, 9, 18], "raw": [7, 10], "re": 17, "read": [4, 6, 8], "read_html": 7, "read_img_as_numpi": 7, "read_img_as_tensor": 7, "read_pdf": 7, "readi": 17, "real": [4, 8, 9], "reason": [1, 4, 6], "rebuild": 2, "rebuilt": 2, "recal": [10, 18], "receipt": [4, 6, 18], "reco_arch": [8, 12, 14, 17], "reco_b": 18, "reco_model": [12, 14, 17], "reco_param": 12, "reco_predictor": 12, "recogn": 18, "recognit": [6, 10, 11, 12], "recognition_predictor": [8, 18], "recognition_task": [6, 16], "recognitiondataset": [6, 16], "recognitionpredictor": [8, 12], "rectangular": 8, "reduc": [3, 9], "refer": [2, 3, 12, 14, 15, 16, 18], "regardless": 1, "region": 18, "regroup": 10, "regular": 16, "reject": 1, "rel": [7, 9, 10, 18], "relat": 7, "releas": [0, 3], "relev": 15, "religion": 1, "remov": 1, "render": [7, 18], "repo": 8, "repo_id": [8, 14], "report": 1, "repositori": [6, 
8, 14], "repres": [1, 17, 18], "represent": [4, 8], "request": [1, 14], "requir": [3, 9, 17], "research": 4, "residu": 8, "resiz": [9, 18], "resnet": 8, "resnet18": [8, 14], "resnet31": 8, "resnet34": 8, "resnet50": [8, 14], "resolv": 7, "resolve_block": 18, "resolve_lin": 18, "resourc": 16, "respect": 1, "rest": [2, 9, 10], "restrict": 13, "result": [2, 6, 7, 11, 14, 17, 18], "return": 18, "reusabl": 18, "review": 1, "rgb": [7, 9], "rgb_mode": 7, "rgb_output": 7, "right": [1, 8, 10], "robust": [4, 6], "root": 6, "rotat": [6, 7, 8, 9, 10, 11, 12, 16, 18], "run": [2, 3, 8], "same": [2, 7, 10, 16, 17, 18], "sampl": [6, 16, 18], "sample_transform": 6, "sar": [4, 8], "sar_resnet31": [8, 18], "satur": 9, "save": [8, 16], "scale": [7, 8, 9, 10], "scale_rang": 9, "scan": [4, 6], "scene": [4, 6, 8], "score": [7, 10], "script": [2, 16], "seamless": 4, "seamlessli": [4, 18], "search": 8, "searchabl": 11, "sec": 18, "second": 18, "section": [12, 14, 15, 17, 18], "secur": [1, 13], "see": [1, 2], "seen": 18, "segment": [4, 8, 18], "self": 18, "semant": [4, 8], "send": 18, "sens": 10, "sensit": 16, "separ": 18, "sequenc": [4, 6, 7, 8, 10, 18], "sequenti": [9, 18], "seri": 1, "seriou": 1, "set": [1, 3, 6, 8, 10, 13, 15, 18], "set_global_polici": 17, "sever": [7, 9, 18], "sex": 1, "sexual": 1, "shade": 9, "shape": [4, 7, 8, 9, 10, 18], "share": [13, 16], "shift": 9, "shm": 13, "should": [2, 6, 7, 9, 10], "show": [4, 7, 8, 10, 12, 14, 15], "showcas": [2, 11], "shuffl": [6, 9], "side": 10, "signatur": 7, "signific": 16, "simpl": [4, 8, 17], "simpler": 8, "sinc": [6, 16], "singl": [1, 2, 4, 6], "single_img_doc": 17, "size": [1, 6, 7, 9, 15, 18], "skew": 18, "slack": 2, "slightli": 8, "small": [2, 8, 18], "smallest": 7, "snapshot_download": 8, "snippet": 18, "so": [2, 3, 6, 8, 14, 16], "social": 1, "socio": 1, "some": [3, 11, 14, 16], "someth": 2, "somewher": 2, "sort": 1, "sourc": [6, 7, 8, 9, 10, 14], "space": [1, 18], "span": 18, "spanish": 6, "spatial": [4, 6, 7], "specif": [2, 3, 10, 12, 16, 18], "specifi": [1, 6, 7], "speed": [4, 8, 18], "sphinx": 2, "sroie": [4, 6, 16], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": 11, "standard": 9, "start": 6, "state": [4, 10, 15], "static": 10, "statu": 1, "std": [9, 12], "step": 13, "still": 18, "str": [6, 7, 8, 9, 10], "straight": [6, 8, 16, 18], "straighten": 18, "straighten_pag": [8, 12, 18], "straigten_pag": 12, "stream": 7, "street": [4, 6], "strict": 3, "strictli": 10, "string": [6, 7, 10, 18], "strive": 3, "strong": [4, 8], "structur": [17, 18], "subset": [6, 18], "suggest": [2, 14], "sum": 10, "summari": 10, "support": [3, 12, 15, 17, 18], "sustain": 1, "svhn": [4, 6, 16], "svt": [6, 16], "swedish": 6, "symmetr": [8, 9, 18], "symmetric_pad": [8, 9, 18], "synthet": 4, "synthtext": [4, 6, 16], "system": 18, "t": [2, 6, 12, 17, 18], "tabl": [14, 15, 16], "take": [1, 6, 18], "target": [6, 7, 9, 10, 16], "target_s": 6, "task": [4, 6, 8, 14, 16, 18], "task2": 6, "team": 3, "techminde": 3, "templat": [2, 4], "tensor": [6, 7, 9, 18], "tensorflow": [3, 4, 7, 8, 9, 12, 14, 17, 18], "tensorspec": 17, "term": 1, "test": [6, 16], "test_set": 6, "text": [6, 7, 8, 10, 16], "text_output": 18, "textmatch": 10, "textnet": 8, "textnet_bas": 8, "textnet_smal": 8, "textnet_tini": 8, "textract": [4, 18], "textstylebrush": [4, 6], "textual": [4, 6, 7, 8, 18], "tf": [3, 7, 8, 9, 14, 17], "than": [2, 10, 14], "thank": 2, "thei": [1, 10], "them": [6, 18], "thi": [1, 2, 3, 5, 6, 9, 10, 12, 13, 14, 16, 17, 18], "thing": [17, 18], "third": 3, "those": [1, 7, 18], "threaten": 
1, "threshold": 18, "through": [1, 9, 15, 16], "tilman": 14, "time": [1, 4, 8, 10, 16], "tini": 8, "titl": [7, 18], "tm": 18, "tmp": 13, "togeth": [2, 7], "tograi": 9, "tool": 16, "top": [10, 17, 18], "topic": 2, "torch": [3, 9, 12, 14, 17], "torchvis": 9, "total": 12, "toward": [1, 3], "train": [2, 6, 8, 9, 14, 15, 16, 17, 18], "train_it": [6, 16], "train_load": [6, 16], "train_pytorch": 14, "train_set": [6, 16], "train_tensorflow": 14, "trainabl": [4, 8], "tranform": 9, "transcrib": 18, "transfer": [4, 6], "transfo": 9, "transform": [4, 6, 8], "translat": 1, "troll": 1, "true": [6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18], "truth": 10, "tune": 17, "tupl": [6, 7, 9, 10], "two": [7, 13], "txt": 6, "type": [7, 10, 14, 17, 18], "typic": 18, "u": [1, 2], "ucsd": 6, "udac": 2, "uint8": [7, 8, 10, 18], "ukrainian": 6, "unaccept": 1, "underli": [16, 18], "underneath": 7, "understand": [4, 6, 18], "uniform": [8, 9], "uniformli": 9, "uninterrupt": [7, 18], "union": 10, "unittest": 2, "unlock": 7, "unoffici": 8, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [8, 18], "updat": 10, "upgrad": 2, "upper": [6, 9], "uppercas": 16, "url": 7, "us": [1, 2, 3, 6, 8, 10, 11, 12, 13, 14, 15, 18], "usabl": 18, "usag": [13, 17], "use_polygon": [6, 10, 16], "useabl": 18, "user": [4, 7, 11], "utf": 18, "util": 17, "v1": 14, "v3": [8, 14, 18], "valid": 16, "valu": [2, 7, 9, 18], "valuabl": 4, "variabl": 13, "varieti": 6, "veri": 8, "version": [1, 2, 3, 17, 18], "vgg": 8, "vgg16": 14, "vgg16_bn_r": 8, "via": 1, "vietnames": 6, "view": [4, 6], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 6, 8], "visiondataset": 6, "visiontransform": 8, "visual": [3, 4, 15], "visualize_pag": 10, "vit_": 8, "vit_b": 8, "vitstr": [4, 8, 17], "vitstr_bas": [8, 18], "vitstr_smal": [8, 12, 17, 18], "viz": 3, "vocab": [12, 14, 16, 17, 18], "vocabulari": [6, 12, 14], "w": [7, 8, 9, 10], "w3": 18, "wa": 1, "wai": [1, 4, 16], "want": [2, 17, 18], "warmup": 18, "wasn": 2, "we": [1, 2, 3, 4, 7, 9, 12, 14, 16, 17, 18], "weasyprint": 7, "web": [2, 7], "websit": 6, "welcom": 1, "well": [1, 17], "were": [1, 7, 18], "what": 1, "when": [1, 2, 8], "whenev": 2, "where": [2, 7, 9, 10], "whether": [2, 6, 7, 9, 10, 16, 18], "which": [1, 8, 13, 15, 16, 18], "whichev": 3, "while": [9, 18], "why": 1, "width": [7, 9], "wiki": 1, "wildreceipt": [4, 6, 16], "window": [8, 10], "wish": 2, "within": 1, "without": [1, 6, 8], "wonder": 2, "word": [4, 6, 8, 10, 18], "word_1_1": 18, "word_1_2": 18, "word_1_3": 18, "wordgener": [6, 16], "words_onli": 10, "work": [12, 13, 18], "workflow": 2, "worklow": 2, "world": [10, 18], "worth": 8, "wrap": 18, "wrapper": [6, 9], "write": 13, "written": [1, 7], "www": [1, 7, 18], "x": [7, 9, 10], "x_ascend": 18, "x_descend": 18, "x_i": 10, "x_size": 18, "x_wconf": 18, "xhtml": 18, "xmax": 7, "xmin": 7, "xml": 18, "xml_bytes_str": 18, "xml_element": 18, "xml_output": 18, "xmln": 18, "y": 10, "y_i": 10, "y_j": 10, "yet": 15, "ymax": 7, "ymin": 7, "yolov8": 15, "you": [2, 3, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18], "your": [2, 4, 7, 10, 18], "yoursit": 7, "zero": [9, 10], "zoo": 12, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 6, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 6, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 
6, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 6, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 6, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 6, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 6, "\u00e4\u00f6\u00e4\u00f6": 6, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 6, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 6, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 6, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 6, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 6, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 6, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 6, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 6, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 6, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 6, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 6, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 6, "\u067e\u0686\u06a2\u06a4\u06af": 6, "\u0905": 6, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 6, 
"\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 6, "\u0950": 6, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 6, "\u09bd": 6, "\u09ce": 6, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 6}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 18, "approach": 18, "architectur": 18, "arg": [6, 7, 8, 9, 10], "artefact": 7, "artefactdetect": 15, "attribut": 1, "avail": [15, 16, 18], "aw": 13, "ban": 1, "block": 7, "bug": 2, "changelog": 0, "choos": [16, 18], "classif": [8, 12, 14], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 14, "compos": 9, "conda": 3, "conduct": 1, "connect": 2, "continu": 2, "contrib": 5, "contribut": [2, 5, 15], "contributor": 1, "convent": 14, "correct": 1, "coven": 1, "custom": [6, 12], "data": 16, "dataload": 6, "dataset": [4, 6, 16], "detect": [4, 8, 14, 16, 18], "develop": 2, "do": 18, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 11], "document": [2, 4, 7], "end": 18, "enforc": 1, "evalu": 10, "export": 17, "factori": 8, "featur": [2, 4], "feedback": 2, "file": 7, "from": 14, "gener": [6, 16], "git": 3, "guidelin": 1, "half": 17, "hub": 14, "huggingfac": 14, "i": 18, "infer": 17, "instal": [2, 3], "integr": [2, 15], "io": 7, "lambda": 13, "let": 2, "line": 7, "linux": 3, "load": [12, 14, 16], "loader": 6, "main": 4, "mode": 2, "model": [4, 8, 12, 14, 17, 18], "modifi": 2, "modul": [5, 15], "name": 14, "notebook": 11, "object": 16, "ocr": [16, 18], "onli": 3, "onnx": 17, "optim": 17, "option": 18, "orient": 12, "our": 1, "output": 18, "own": [12, 16], "packag": 3, "page": 7, "perman": 1, "pipelin": 15, "pledg": 1, "precis": 17, "predictor": 18, "prepar": 17, "prerequisit": 3, "pretrain": 14, "push": 14, "python": 3, "qualiti": 2, "question": 2, "read": 7, "readi": 16, "recognit": [4, 8, 14, 16, 18], "report": 2, "request": 2, "respons": 1, "return": [6, 7, 8, 10], "right": 18, "scope": 1, "share": 14, "should": 18, "stage": 18, "standard": 1, "structur": [2, 7], "style": 2, "support": [4, 5, 6, 9], "synthet": [6, 16], "task": 10, "temporari": 1, "test": 2, "text": [4, 18], "train": 12, "transform": 9, "two": 18, "unit": 2, "us": [16, 17], "util": 10, "v0": 0, "verif": 2, "via": 3, "visual": 10, "vocab": 6, "warn": 1, "what": 18, "word": 7, "your": [12, 14, 15, 16, 17], "zoo": [4, 8]}}) \ No newline at end of file diff --git a/using_doctr/custom_models_training.html 
b/using_doctr/custom_models_training.html index 580b4368b7..e664c6a950 100644 --- a/using_doctr/custom_models_training.html +++ b/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -615,7 +615,7 @@

Loading your custom trained orientation classification model - + diff --git a/using_doctr/running_on_aws.html b/using_doctr/running_on_aws.html index ddb0c3c80f..81c38b49f5 100644 --- a/using_doctr/running_on_aws.html +++ b/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -358,7 +358,7 @@

AWS Lambda - + diff --git a/using_doctr/sharing_models.html b/using_doctr/sharing_models.html index 07a3b2f2a3..4f5d1d68a5 100644 --- a/using_doctr/sharing_models.html +++ b/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -540,7 +540,7 @@

Recognition - + diff --git a/using_doctr/using_contrib_modules.html b/using_doctr/using_contrib_modules.html index b4a10925e6..cf282ff3a4 100644 --- a/using_doctr/using_contrib_modules.html +++ b/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -411,7 +411,7 @@

ArtefactDetection - + diff --git a/using_doctr/using_datasets.html b/using_doctr/using_datasets.html index 4a52df36ba..e30b6d6459 100644 --- a/using_doctr/using_datasets.html +++ b/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -638,7 +638,7 @@

Data Loading - + diff --git a/using_doctr/using_model_export.html b/using_doctr/using_model_export.html index 2b30ee63a1..ad9d09ed4c 100644 --- a/using_doctr/using_model_export.html +++ b/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -463,7 +463,7 @@

Using your ONNX exported model - + diff --git a/using_doctr/using_models.html b/using_doctr/using_models.html index 13cb06116b..5c80dbf62d 100644 --- a/using_doctr/using_models.html +++ b/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1249,7 +1249,7 @@

Advanced options - + diff --git a/v0.1.0/_modules/doctr/datasets/cord.html b/v0.1.0/_modules/doctr/datasets/cord.html index 78e70014e3..55b0584830 100644 --- a/v0.1.0/_modules/doctr/datasets/cord.html +++ b/v0.1.0/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -462,7 +462,7 @@

Source code for doctr.datasets.cord

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/detection.html b/v0.1.0/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.1.0/_modules/doctr/datasets/detection.html +++ b/v0.1.0/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/doc_artefacts.html b/v0.1.0/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.1.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.1.0/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.1.0/_modules/doctr/datasets/funsd.html b/v0.1.0/_modules/doctr/datasets/funsd.html index e52abc5428..f08612f9fa 100644 --- a/v0.1.0/_modules/doctr/datasets/funsd.html +++ b/v0.1.0/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.funsd

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.1.0/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.1.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.1.0/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/datasets/ic03.html b/v0.1.0/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.1.0/_modules/doctr/datasets/ic03.html +++ b/v0.1.0/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/ic13.html b/v0.1.0/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.1.0/_modules/doctr/datasets/ic13.html +++ b/v0.1.0/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/iiit5k.html b/v0.1.0/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.1.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.1.0/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/iiithws.html b/v0.1.0/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.1.0/_modules/doctr/datasets/iiithws.html +++ b/v0.1.0/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/imgur5k.html b/v0.1.0/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.1.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.1.0/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/loader.html b/v0.1.0/_modules/doctr/datasets/loader.html index d1785caa1c..ed80350ef0 100644 --- a/v0.1.0/_modules/doctr/datasets/loader.html +++ b/v0.1.0/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -429,7 +429,7 @@

Source code for doctr.datasets.loader

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/mjsynth.html b/v0.1.0/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.1.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.1.0/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/ocr.html b/v0.1.0/_modules/doctr/datasets/ocr.html index 5832933ea5..ce1ed8b0d4 100644 --- a/v0.1.0/_modules/doctr/datasets/ocr.html +++ b/v0.1.0/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -403,7 +403,7 @@

Source code for doctr.datasets.ocr

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/recognition.html b/v0.1.0/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.1.0/_modules/doctr/datasets/recognition.html +++ b/v0.1.0/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/sroie.html b/v0.1.0/_modules/doctr/datasets/sroie.html index 94c963390e..04cf10bda2 100644 --- a/v0.1.0/_modules/doctr/datasets/sroie.html +++ b/v0.1.0/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.sroie

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/svhn.html b/v0.1.0/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.1.0/_modules/doctr/datasets/svhn.html +++ b/v0.1.0/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/svt.html b/v0.1.0/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.1.0/_modules/doctr/datasets/svt.html +++ b/v0.1.0/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/synthtext.html b/v0.1.0/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.1.0/_modules/doctr/datasets/synthtext.html +++ b/v0.1.0/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/utils.html b/v0.1.0/_modules/doctr/datasets/utils.html index 9defb17ba5..bde9304597 100644 --- a/v0.1.0/_modules/doctr/datasets/utils.html +++ b/v0.1.0/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -554,7 +554,7 @@

Source code for doctr.datasets.utils

     
   
- + diff --git a/v0.1.0/_modules/doctr/datasets/wildreceipt.html b/v0.1.0/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.1.0/_modules/doctr/datasets/wildreceipt.html +++ b/v0.1.0/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.1.0/_modules/doctr/io/elements.html b/v0.1.0/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.1.0/_modules/doctr/io/elements.html +++ b/v0.1.0/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.1.0/_modules/doctr/io/html.html b/v0.1.0/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.1.0/_modules/doctr/io/html.html +++ b/v0.1.0/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.1.0/_modules/doctr/io/image/base.html b/v0.1.0/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.1.0/_modules/doctr/io/image/base.html +++ b/v0.1.0/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.1.0/_modules/doctr/io/image/tensorflow.html b/v0.1.0/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.1.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.1.0/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.1.0/_modules/doctr/io/pdf.html b/v0.1.0/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.1.0/_modules/doctr/io/pdf.html +++ b/v0.1.0/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.1.0/_modules/doctr/io/reader.html b/v0.1.0/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.1.0/_modules/doctr/io/reader.html +++ b/v0.1.0/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.1.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.1.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.1.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.1.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.1.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.1.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.1.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/classification/zoo.html b/v0.1.0/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.1.0/_modules/doctr/models/classification/zoo.html +++ b/v0.1.0/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

- + diff --git a/v0.1.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.1.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 4325d0b74a..66cef8663d 100644 --- a/v0.1.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -759,7 +759,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.1.0/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.1.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.1.0/_modules/doctr/models/detection/linknet/tensorflow.html index dbb58e37cf..ce995f99d4 100644 --- a/v0.1.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -716,7 +716,7 @@

Source code for doctr.models.detection.linknet.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/detection/zoo.html b/v0.1.0/_modules/doctr/models/detection/zoo.html index 312f4584ab..3651c4e2d3 100644 --- a/v0.1.0/_modules/doctr/models/detection/zoo.html +++ b/v0.1.0/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -450,7 +450,7 @@

Source code for doctr.models.detection.zoo

     
   
- + diff --git a/v0.1.0/_modules/doctr/models/factory/hub.html b/v0.1.0/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.1.0/_modules/doctr/models/factory/hub.html +++ b/v0.1.0/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.1.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/crnn/tensorflow.html index e50c245923..bc64da9a1b 100644 --- a/v0.1.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -658,7 +658,7 @@

Source code for doctr.models.recognition.crnn.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/master/tensorflow.html index 152ebb7e59..aa6aa69325 100644 --- a/v0.1.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -655,7 +655,7 @@

Source code for doctr.models.recognition.master.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.1.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/sar/tensorflow.html index 010bc2bc54..4a591e6451 100644 --- a/v0.1.0/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -757,7 +757,7 @@

Source code for doctr.models.recognition.sar.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.1.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.1.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/models/recognition/zoo.html b/v0.1.0/_modules/doctr/models/recognition/zoo.html index 2c47f88de4..f664304019 100644 --- a/v0.1.0/_modules/doctr/models/recognition/zoo.html +++ b/v0.1.0/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -415,7 +415,7 @@

Source code for doctr.models.recognition.zoo

   
- + diff --git a/v0.1.0/_modules/doctr/models/zoo.html b/v0.1.0/_modules/doctr/models/zoo.html index 5b22f2c79f..d459671648 100644 --- a/v0.1.0/_modules/doctr/models/zoo.html +++ b/v0.1.0/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -576,7 +576,7 @@

Source code for doctr.models.zoo

     
   
- + diff --git a/v0.1.0/_modules/doctr/transforms/modules/base.html b/v0.1.0/_modules/doctr/transforms/modules/base.html index 96ebd680b7..4596df3848 100644 --- a/v0.1.0/_modules/doctr/transforms/modules/base.html +++ b/v0.1.0/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -643,7 +643,7 @@

Source code for doctr.transforms.modules.base

- + diff --git a/v0.1.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.1.0/_modules/doctr/transforms/modules/tensorflow.html index 0e18bcc922..acbbe96225 100644 --- a/v0.1.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.1.0/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -956,7 +956,7 @@

Source code for doctr.transforms.modules.tensorflow

- + diff --git a/v0.1.0/_modules/doctr/utils/metrics.html b/v0.1.0/_modules/doctr/utils/metrics.html index d35d7e9672..8a37d5949a 100644 --- a/v0.1.0/_modules/doctr/utils/metrics.html +++ b/v0.1.0/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -936,7 +936,7 @@

Source code for doctr.utils.metrics

     
   
- + diff --git a/v0.1.0/_modules/doctr/utils/visualization.html b/v0.1.0/_modules/doctr/utils/visualization.html index e608d492a4..c818be6d7b 100644 --- a/v0.1.0/_modules/doctr/utils/visualization.html +++ b/v0.1.0/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -720,7 +720,7 @@

Source code for doctr.utils.visualization

     
   
- + diff --git a/v0.1.0/_modules/index.html b/v0.1.0/_modules/index.html index 758ef41bd0..5793c44f20 100644 --- a/v0.1.0/_modules/index.html +++ b/v0.1.0/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -378,7 +378,7 @@

All modules for which code is available

- + diff --git a/v0.1.0/_sources/getting_started/installing.rst.txt b/v0.1.0/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.1.0/_sources/getting_started/installing.rst.txt +++ b/v0.1.0/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.1.0/_static/basic.css b/v0.1.0/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.1.0/_static/basic.css +++ b/v0.1.0/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.1.0/_static/doctools.js b/v0.1.0/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.1.0/_static/doctools.js +++ b/v0.1.0/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.1.0/_static/language_data.js b/v0.1.0/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.1.0/_static/language_data.js +++ b/v0.1.0/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.1.0/_static/searchtools.js b/v0.1.0/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.1.0/_static/searchtools.js +++ b/v0.1.0/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. 
+class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.1.0/changelog.html b/v0.1.0/changelog.html index ac81a6f231..fc45a50384 100644 --- a/v0.1.0/changelog.html +++ b/v0.1.0/changelog.html @@ -14,7 +14,7 @@ - + Changelog - docTR documentation @@ -446,7 +446,7 @@

v0.1.0 (2021-03-05) - + diff --git a/v0.1.0/community/resources.html b/v0.1.0/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.1.0/community/resources.html +++ b/v0.1.0/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.1.0/contributing/code_of_conduct.html b/v0.1.0/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.1.0/contributing/code_of_conduct.html +++ b/v0.1.0/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.1.0/contributing/contributing.html b/v0.1.0/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.1.0/contributing/contributing.html +++ b/v0.1.0/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.1.0/genindex.html b/v0.1.0/genindex.html index cbb43f08d8..21520455b4 100644 --- a/v0.1.0/genindex.html +++ b/v0.1.0/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -756,7 +756,7 @@

W

- + diff --git a/v0.1.0/getting_started/installing.html b/v0.1.0/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.1.0/getting_started/installing.html +++ b/v0.1.0/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.1.0/index.html b/v0.1.0/index.html index 76509686f5..3a06afc6d9 100644 --- a/v0.1.0/index.html +++ b/v0.1.0/index.html @@ -14,7 +14,7 @@ - + docTR documentation @@ -445,7 +445,7 @@

Supported datasets - + diff --git a/v0.1.0/modules/contrib.html b/v0.1.0/modules/contrib.html index e99f6b3f74..7fb86b8b38 100644 --- a/v0.1.0/modules/contrib.html +++ b/v0.1.0/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -380,7 +380,7 @@

Supported contribution modules - + diff --git a/v0.1.0/modules/datasets.html b/v0.1.0/modules/datasets.html index 456e10b172..380a986793 100644 --- a/v0.1.0/modules/datasets.html +++ b/v0.1.0/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -1081,7 +1081,7 @@

Returns: - + diff --git a/v0.1.0/modules/io.html b/v0.1.0/modules/io.html index 01eadaa4b8..24c41954be 100644 --- a/v0.1.0/modules/io.html +++ b/v0.1.0/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -760,7 +760,7 @@

Returns: - + diff --git a/v0.1.0/modules/models.html b/v0.1.0/modules/models.html index c465cc0586..91b8810a6a 100644 --- a/v0.1.0/modules/models.html +++ b/v0.1.0/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1612,7 +1612,7 @@

Args: - + diff --git a/v0.1.0/modules/transforms.html b/v0.1.0/modules/transforms.html index 30f7a2631a..c5ead3f3ce 100644 --- a/v0.1.0/modules/transforms.html +++ b/v0.1.0/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -835,7 +835,7 @@

Args:< - + diff --git a/v0.1.0/modules/utils.html b/v0.1.0/modules/utils.html index 888a32c321..b7f6fc570b 100644 --- a/v0.1.0/modules/utils.html +++ b/v0.1.0/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -715,7 +715,7 @@

Args: - + diff --git a/v0.1.0/notebooks.html b/v0.1.0/notebooks.html index f97771aebb..d36539f59e 100644 --- a/v0.1.0/notebooks.html +++ b/v0.1.0/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -391,7 +391,7 @@

docTR Notebooks - + diff --git a/v0.1.0/search.html b/v0.1.0/search.html index 82b8bd6950..d050f5eac7 100644 --- a/v0.1.0/search.html +++ b/v0.1.0/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -340,7 +340,7 @@ - + diff --git a/v0.1.0/searchindex.js b/v0.1.0/searchindex.js index bfa546d0e9..6f154115ab 100644 --- a/v0.1.0/searchindex.js +++ b/v0.1.0/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, "codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], "Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, 
"end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], "Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, 
"what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module 
doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, 
"doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, 
"doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", 
false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", "Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], 
[10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], "51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 
19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [4, 10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, "b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 
19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], "db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 
10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 
18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], "json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": 
[2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, 
"ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 
10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": 
[3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], 
"word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, "coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, 
"guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, 
"codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], "Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, "end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], 
"Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", 
"using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in 
doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, 
"doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, 
"doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", 
"crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", "Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 
17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], "51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 
11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, "b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 
19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], "db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], 
"faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": 
[15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], "json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": 
[7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], 
"problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], 
"skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, 
"view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, 
"\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 
8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, "coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.1.0/using_doctr/custom_models_training.html b/v0.1.0/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.1.0/using_doctr/custom_models_training.html +++ b/v0.1.0/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.1.0/using_doctr/running_on_aws.html b/v0.1.0/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.1.0/using_doctr/running_on_aws.html +++ b/v0.1.0/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.1.0/using_doctr/sharing_models.html b/v0.1.0/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.1.0/using_doctr/sharing_models.html +++ b/v0.1.0/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.1.0/using_doctr/using_contrib_modules.html b/v0.1.0/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.1.0/using_doctr/using_contrib_modules.html +++ b/v0.1.0/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.1.0/using_doctr/using_datasets.html b/v0.1.0/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.1.0/using_doctr/using_datasets.html +++ b/v0.1.0/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.1.0/using_doctr/using_model_export.html b/v0.1.0/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.1.0/using_doctr/using_model_export.html +++ b/v0.1.0/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.1.0/using_doctr/using_models.html b/v0.1.0/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.1.0/using_doctr/using_models.html +++ b/v0.1.0/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.1.1/_modules/doctr/datasets/cord.html b/v0.1.1/_modules/doctr/datasets/cord.html index 78e70014e3..55b0584830 100644 --- a/v0.1.1/_modules/doctr/datasets/cord.html +++ b/v0.1.1/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -462,7 +462,7 @@

Source code for doctr.datasets.cord

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/detection.html b/v0.1.1/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.1.1/_modules/doctr/datasets/detection.html +++ b/v0.1.1/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/doc_artefacts.html b/v0.1.1/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.1.1/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.1.1/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.1.1/_modules/doctr/datasets/funsd.html b/v0.1.1/_modules/doctr/datasets/funsd.html index e52abc5428..f08612f9fa 100644 --- a/v0.1.1/_modules/doctr/datasets/funsd.html +++ b/v0.1.1/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.funsd

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/generator/tensorflow.html b/v0.1.1/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.1.1/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.1.1/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/datasets/ic03.html b/v0.1.1/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.1.1/_modules/doctr/datasets/ic03.html +++ b/v0.1.1/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/ic13.html b/v0.1.1/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.1.1/_modules/doctr/datasets/ic13.html +++ b/v0.1.1/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/iiit5k.html b/v0.1.1/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.1.1/_modules/doctr/datasets/iiit5k.html +++ b/v0.1.1/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/iiithws.html b/v0.1.1/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.1.1/_modules/doctr/datasets/iiithws.html +++ b/v0.1.1/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/imgur5k.html b/v0.1.1/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.1.1/_modules/doctr/datasets/imgur5k.html +++ b/v0.1.1/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/loader.html b/v0.1.1/_modules/doctr/datasets/loader.html index d1785caa1c..ed80350ef0 100644 --- a/v0.1.1/_modules/doctr/datasets/loader.html +++ b/v0.1.1/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -429,7 +429,7 @@

Source code for doctr.datasets.loader

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/mjsynth.html b/v0.1.1/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.1.1/_modules/doctr/datasets/mjsynth.html +++ b/v0.1.1/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/ocr.html b/v0.1.1/_modules/doctr/datasets/ocr.html index 5832933ea5..ce1ed8b0d4 100644 --- a/v0.1.1/_modules/doctr/datasets/ocr.html +++ b/v0.1.1/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -403,7 +403,7 @@

Source code for doctr.datasets.ocr

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/recognition.html b/v0.1.1/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.1.1/_modules/doctr/datasets/recognition.html +++ b/v0.1.1/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/sroie.html b/v0.1.1/_modules/doctr/datasets/sroie.html index 94c963390e..04cf10bda2 100644 --- a/v0.1.1/_modules/doctr/datasets/sroie.html +++ b/v0.1.1/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.sroie

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/svhn.html b/v0.1.1/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.1.1/_modules/doctr/datasets/svhn.html +++ b/v0.1.1/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/svt.html b/v0.1.1/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.1.1/_modules/doctr/datasets/svt.html +++ b/v0.1.1/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/synthtext.html b/v0.1.1/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.1.1/_modules/doctr/datasets/synthtext.html +++ b/v0.1.1/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/utils.html b/v0.1.1/_modules/doctr/datasets/utils.html index 9defb17ba5..bde9304597 100644 --- a/v0.1.1/_modules/doctr/datasets/utils.html +++ b/v0.1.1/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -554,7 +554,7 @@

Source code for doctr.datasets.utils

     
   
- + diff --git a/v0.1.1/_modules/doctr/datasets/wildreceipt.html b/v0.1.1/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.1.1/_modules/doctr/datasets/wildreceipt.html +++ b/v0.1.1/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.1.1/_modules/doctr/io/elements.html b/v0.1.1/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.1.1/_modules/doctr/io/elements.html +++ b/v0.1.1/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.1.1/_modules/doctr/io/html.html b/v0.1.1/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.1.1/_modules/doctr/io/html.html +++ b/v0.1.1/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.1.1/_modules/doctr/io/image/base.html b/v0.1.1/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.1.1/_modules/doctr/io/image/base.html +++ b/v0.1.1/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.1.1/_modules/doctr/io/image/tensorflow.html b/v0.1.1/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.1.1/_modules/doctr/io/image/tensorflow.html +++ b/v0.1.1/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.1.1/_modules/doctr/io/pdf.html b/v0.1.1/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.1.1/_modules/doctr/io/pdf.html +++ b/v0.1.1/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.1.1/_modules/doctr/io/reader.html b/v0.1.1/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.1.1/_modules/doctr/io/reader.html +++ b/v0.1.1/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.1.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.1.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.1.1/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.1.1/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.1.1/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.1.1/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.1.1/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/classification/zoo.html b/v0.1.1/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.1.1/_modules/doctr/models/classification/zoo.html +++ b/v0.1.1/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

- + diff --git a/v0.1.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.1.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 4325d0b74a..66cef8663d 100644 --- a/v0.1.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -759,7 +759,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.1.1/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.1.1/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.1.1/_modules/doctr/models/detection/linknet/tensorflow.html index dbb58e37cf..ce995f99d4 100644 --- a/v0.1.1/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -716,7 +716,7 @@

Source code for doctr.models.detection.linknet.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/detection/zoo.html b/v0.1.1/_modules/doctr/models/detection/zoo.html index 312f4584ab..3651c4e2d3 100644 --- a/v0.1.1/_modules/doctr/models/detection/zoo.html +++ b/v0.1.1/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -450,7 +450,7 @@

Source code for doctr.models.detection.zoo

     
   
- + diff --git a/v0.1.1/_modules/doctr/models/factory/hub.html b/v0.1.1/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.1.1/_modules/doctr/models/factory/hub.html +++ b/v0.1.1/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.1.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/crnn/tensorflow.html index e50c245923..bc64da9a1b 100644 --- a/v0.1.1/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -658,7 +658,7 @@

Source code for doctr.models.recognition.crnn.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/master/tensorflow.html index 152ebb7e59..aa6aa69325 100644 --- a/v0.1.1/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -655,7 +655,7 @@

Source code for doctr.models.recognition.master.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.1.1/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/sar/tensorflow.html index 010bc2bc54..4a591e6451 100644 --- a/v0.1.1/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -757,7 +757,7 @@

Source code for doctr.models.recognition.sar.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.1.1/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.1.1/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/models/recognition/zoo.html b/v0.1.1/_modules/doctr/models/recognition/zoo.html index 2c47f88de4..f664304019 100644 --- a/v0.1.1/_modules/doctr/models/recognition/zoo.html +++ b/v0.1.1/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -415,7 +415,7 @@

Source code for doctr.models.recognition.zoo

   
- + diff --git a/v0.1.1/_modules/doctr/models/zoo.html b/v0.1.1/_modules/doctr/models/zoo.html index 5b22f2c79f..d459671648 100644 --- a/v0.1.1/_modules/doctr/models/zoo.html +++ b/v0.1.1/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -576,7 +576,7 @@

Source code for doctr.models.zoo

     
   
- + diff --git a/v0.1.1/_modules/doctr/transforms/modules/base.html b/v0.1.1/_modules/doctr/transforms/modules/base.html index 96ebd680b7..4596df3848 100644 --- a/v0.1.1/_modules/doctr/transforms/modules/base.html +++ b/v0.1.1/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -643,7 +643,7 @@

Source code for doctr.transforms.modules.base

- + diff --git a/v0.1.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.1.1/_modules/doctr/transforms/modules/tensorflow.html index 0e18bcc922..acbbe96225 100644 --- a/v0.1.1/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.1.1/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -956,7 +956,7 @@

Source code for doctr.transforms.modules.tensorflow

- + diff --git a/v0.1.1/_modules/doctr/utils/metrics.html b/v0.1.1/_modules/doctr/utils/metrics.html index d35d7e9672..8a37d5949a 100644 --- a/v0.1.1/_modules/doctr/utils/metrics.html +++ b/v0.1.1/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -936,7 +936,7 @@

Source code for doctr.utils.metrics

     
   
- + diff --git a/v0.1.1/_modules/doctr/utils/visualization.html b/v0.1.1/_modules/doctr/utils/visualization.html index e608d492a4..c818be6d7b 100644 --- a/v0.1.1/_modules/doctr/utils/visualization.html +++ b/v0.1.1/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -720,7 +720,7 @@

Source code for doctr.utils.visualization

     
   
- + diff --git a/v0.1.1/_modules/index.html b/v0.1.1/_modules/index.html index 758ef41bd0..5793c44f20 100644 --- a/v0.1.1/_modules/index.html +++ b/v0.1.1/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -378,7 +378,7 @@

All modules for which code is available

- + diff --git a/v0.1.1/_sources/getting_started/installing.rst.txt b/v0.1.1/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.1.1/_sources/getting_started/installing.rst.txt +++ b/v0.1.1/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.1.1/_static/basic.css b/v0.1.1/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.1.1/_static/basic.css +++ b/v0.1.1/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.1.1/_static/doctools.js b/v0.1.1/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.1.1/_static/doctools.js +++ b/v0.1.1/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.1.1/_static/language_data.js b/v0.1.1/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.1.1/_static/language_data.js +++ b/v0.1.1/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.1.1/_static/searchtools.js b/v0.1.1/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.1.1/_static/searchtools.js +++ b/v0.1.1/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. 
+class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.1.1/changelog.html b/v0.1.1/changelog.html index ac81a6f231..fc45a50384 100644 --- a/v0.1.1/changelog.html +++ b/v0.1.1/changelog.html @@ -14,7 +14,7 @@ - + Changelog - docTR documentation @@ -446,7 +446,7 @@

v0.1.0 (2021-03-05) - + diff --git a/v0.1.1/community/resources.html b/v0.1.1/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.1.1/community/resources.html +++ b/v0.1.1/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.1.1/contributing/code_of_conduct.html b/v0.1.1/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.1.1/contributing/code_of_conduct.html +++ b/v0.1.1/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.1.1/contributing/contributing.html b/v0.1.1/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.1.1/contributing/contributing.html +++ b/v0.1.1/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.1.1/genindex.html b/v0.1.1/genindex.html index cbb43f08d8..21520455b4 100644 --- a/v0.1.1/genindex.html +++ b/v0.1.1/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -756,7 +756,7 @@

W

- + diff --git a/v0.1.1/getting_started/installing.html b/v0.1.1/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.1.1/getting_started/installing.html +++ b/v0.1.1/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:
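For a quick sanity check of that prerequisite, a minimal sketch in Python (an editorial illustration, not part of this diff; it assumes only the standard library, and that `torch` and `tensorflow` are the usual import names for PyTorch and TensorFlow) that reports which backend is importable:

```python
# Minimal sketch: verify that at least one supported deep learning backend is installed.
import importlib.util

available = {
    name: importlib.util.find_spec(name) is not None
    for name in ("torch", "tensorflow")  # usual import names for PyTorch / TensorFlow
}
print(available)
if not any(available.values()):
    raise SystemExit("docTR needs at least one of PyTorch or TensorFlow installed.")
```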

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.1.1/index.html b/v0.1.1/index.html index 76509686f5..3a06afc6d9 100644 --- a/v0.1.1/index.html +++ b/v0.1.1/index.html @@ -14,7 +14,7 @@ - + docTR documentation @@ -445,7 +445,7 @@

Supported datasets - + diff --git a/v0.1.1/modules/contrib.html b/v0.1.1/modules/contrib.html index e99f6b3f74..7fb86b8b38 100644 --- a/v0.1.1/modules/contrib.html +++ b/v0.1.1/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -380,7 +380,7 @@

Supported contribution modules - + diff --git a/v0.1.1/modules/datasets.html b/v0.1.1/modules/datasets.html index 456e10b172..380a986793 100644 --- a/v0.1.1/modules/datasets.html +++ b/v0.1.1/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -1081,7 +1081,7 @@

Returns: - + diff --git a/v0.1.1/modules/io.html b/v0.1.1/modules/io.html index 01eadaa4b8..24c41954be 100644 --- a/v0.1.1/modules/io.html +++ b/v0.1.1/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -760,7 +760,7 @@

Returns: - + diff --git a/v0.1.1/modules/models.html b/v0.1.1/modules/models.html index c465cc0586..91b8810a6a 100644 --- a/v0.1.1/modules/models.html +++ b/v0.1.1/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1612,7 +1612,7 @@

Args: - + diff --git a/v0.1.1/modules/transforms.html b/v0.1.1/modules/transforms.html index 30f7a2631a..c5ead3f3ce 100644 --- a/v0.1.1/modules/transforms.html +++ b/v0.1.1/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -835,7 +835,7 @@

Args:< - + diff --git a/v0.1.1/modules/utils.html b/v0.1.1/modules/utils.html index 888a32c321..b7f6fc570b 100644 --- a/v0.1.1/modules/utils.html +++ b/v0.1.1/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -715,7 +715,7 @@

Args: - + diff --git a/v0.1.1/notebooks.html b/v0.1.1/notebooks.html index f97771aebb..d36539f59e 100644 --- a/v0.1.1/notebooks.html +++ b/v0.1.1/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -391,7 +391,7 @@

docTR Notebooks - + diff --git a/v0.1.1/search.html b/v0.1.1/search.html index 82b8bd6950..d050f5eac7 100644 --- a/v0.1.1/search.html +++ b/v0.1.1/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -340,7 +340,7 @@ - + diff --git a/v0.1.1/searchindex.js b/v0.1.1/searchindex.js index bfa546d0e9..6f154115ab 100644 --- a/v0.1.1/searchindex.js +++ b/v0.1.1/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, "codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], "Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, 
"end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], "Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, 
"what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module 
doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, 
"doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, 
"doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", 
false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", "Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], 
[10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], "51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 
19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [4, 10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, "b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 
19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], "db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 
10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 
18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], "json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": 
[2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, 
"ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 
10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": 
[3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], 
"word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, "coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, 
"guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, 
"codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], "Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, "end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], 
"Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", 
"using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in 
doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, 
"doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, 
"doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", 
"crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", "Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 
17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], "51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 
11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, "b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 
19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], "db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], 
"faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": 
[15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], "json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": 
[7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], 
"problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], 
"skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, 
"view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, 
"\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 
8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, "coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.1.1/using_doctr/custom_models_training.html b/v0.1.1/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.1.1/using_doctr/custom_models_training.html +++ b/v0.1.1/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.1.1/using_doctr/running_on_aws.html b/v0.1.1/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.1.1/using_doctr/running_on_aws.html +++ b/v0.1.1/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.1.1/using_doctr/sharing_models.html b/v0.1.1/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.1.1/using_doctr/sharing_models.html +++ b/v0.1.1/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.1.1/using_doctr/using_contrib_modules.html b/v0.1.1/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.1.1/using_doctr/using_contrib_modules.html +++ b/v0.1.1/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.1.1/using_doctr/using_datasets.html b/v0.1.1/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.1.1/using_doctr/using_datasets.html +++ b/v0.1.1/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.1.1/using_doctr/using_model_export.html b/v0.1.1/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.1.1/using_doctr/using_model_export.html +++ b/v0.1.1/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.1.1/using_doctr/using_models.html b/v0.1.1/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.1.1/using_doctr/using_models.html +++ b/v0.1.1/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.2.0/_modules/doctr/datasets/cord.html b/v0.2.0/_modules/doctr/datasets/cord.html index de8018d676..55b0584830 100644 --- a/v0.2.0/_modules/doctr/datasets/cord.html +++ b/v0.2.0/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.datasets.cord

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
-import tensorflow as tf
+from typing import Any, Dict, List, Tuple, Union
 
-from .core import VisionDataset
+import numpy as np
+from tqdm import tqdm
+
+from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['CORD']
+__all__ = ["CORD"]
 
 
 
-[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - Example:: - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 + :align: center + + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', - '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', - '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", + "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", + "cord_train.zip", + ) + + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", + "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", + "cord_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: - - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - - # # List images - self.root = os.path.join(self._root, 'image') - self.data: List[Tuple[str, Dict[str, Any]]] = [] + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) + + # List images + tmp_root = os.path.join(self.root, "image") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] self.train = train - self.sample_transforms = (lambda x: x) if sample_transforms is None else sample_transforms - for img_path in os.listdir(self.root): + np_dtype = np.float32 + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): + # File existence check + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem _targets = [] - with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: - x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] - y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - # Reduce 8 coords to 4 - left, right = min(x), max(x) - top, bot = min(y), max(y) if len(word["text"]) > 0: - _targets.append((word["text"], [left, top, right, bot])) + x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] + y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] + box: Union[List[float], np.ndarray] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], + dtype=np_dtype, + ) + else: + # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax + box = [min(x), min(y), max(x), max(y)] + _targets.append((word["text"], box)) text_targets, box_targets = zip(*_targets) - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.int), labels=text_targets))) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: - return f"train={self.train}" - - def __getitem__(self, index: int) -> Tuple[tf.Tensor, Dict[str, Any]]: - img_name, target = self.data[index] - # Read image - img = tf.io.read_file(os.path.join(self.root, img_name)) - img = tf.image.decode_jpeg(img, channels=3) - img = self.sample_transforms(img) - - return img, target - - @staticmethod - def collate_fn(samples: List[Tuple[tf.Tensor, Dict[str, Any]]]) -> Tuple[tf.Tensor, List[Dict[str, Any]]]: - - images, targets = zip(*samples) - images = tf.stack(images, axis=0) - - return images, list(targets)
+ return f"train={self.train}"
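A minimal usage sketch of the reworked CORD constructor shown above; the flags and the ValueError guard come straight from this hunk, while the target layout comment summarises the code (actual downloads depend on your environment):

from doctr.datasets import CORD

# Full dataset: image paths with straight boxes and string labels
train_set = CORD(train=True, download=True)
img, target = train_set[0]  # target: dict(boxes=<(N, 4) int array>, labels=[str, ...])

# Keep the 4-point polygons instead of straight boxes
poly_set = CORD(train=True, download=True, use_polygons=True)

# Word crops paired with their transcriptions, e.g. for recognition training
reco_set = CORD(train=True, download=True, recognition_task=True)

# Passing recognition_task=True and detection_task=True together raises a ValueError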
@@ -394,8 +461,8 @@

Source code for doctr.datasets.cord

       
     
   
- - + + diff --git a/v0.2.0/_modules/doctr/datasets/core.html b/v0.2.0/_modules/doctr/datasets/core.html deleted file mode 100644 index a1d2ee62ad..0000000000 --- a/v0.2.0/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,392 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, index: int) -> Any:
-        raise NotImplementedError
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/_modules/doctr/datasets/detection.html b/v0.2.0/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.2.0/_modules/doctr/datasets/detection.html +++ b/v0.2.0/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/doc_artefacts.html b/v0.2.0/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.2.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.2.0/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.2.0/_modules/doctr/datasets/funsd.html b/v0.2.0/_modules/doctr/datasets/funsd.html index f536b9282c..f08612f9fa 100644 --- a/v0.2.0/_modules/doctr/datasets/funsd.html +++ b/v0.2.0/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.datasets.funsd

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
-import tensorflow as tf
+from typing import Any, Dict, List, Tuple, Union
 
-from .core import VisionDataset
+import numpy as np
+from tqdm import tqdm
+
+from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['FUNSD']
+__all__ = ["FUNSD"]
 
 
 
-[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - Example:: - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 + :align: center + + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' - SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' - FILE_NAME = 'funsd.zip' + URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" + SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" + FILE_NAME = "funsd.zip" def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + super().__init__( + self.URL, + self.FILE_NAME, + self.SHA256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - self.sample_transforms = (lambda x: x) if sample_transforms is None else sample_transforms + np_dtype = np.float32 # Use the subset - subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') + subfolder = os.path.join("dataset", "training_data" if train else "testing_data") # # List images - self.root = os.path.join(self._root, subfolder, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + tmp_root = os.path.join(self.root, subfolder, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): + # File existence check + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: data = json.load(f) - _targets = [(word['text'], word['box']) for block in data['form'] - for word in block['words'] if len(word['text']) > 0] - + _targets = [ + (word["text"], word["box"]) + for block in data["form"] + for word in block["words"] + if len(word["text"]) > 0 + ] text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.int), labels=text_targets))) + if use_polygons: + # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + box_targets = [ # type: ignore[assignment] + [ + [box[0], box[1]], + [box[2], box[1]], + [box[2], box[3]], + [box[0], box[3]], + ] + for box in box_targets + ] + + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) + ) + for crop, label in zip(crops, list(text_targets)): + # filter labels with unknown characters + if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: - return f"train={self.train}" - - def __getitem__(self, index: int) -> Tuple[tf.Tensor, Dict[str, Any]]: - img_name, target = self.data[index] - # Read image - img = tf.io.read_file(os.path.join(self.root, img_name)) - img = tf.image.decode_jpeg(img, channels=3) - img = self.sample_transforms(img) - - return img, target - - @staticmethod - def collate_fn(samples: List[Tuple[tf.Tensor, Dict[str, Any]]]) -> Tuple[tf.Tensor, List[Dict[str, Any]]]: - - images, targets = zip(*samples) - images = tf.stack(images, axis=0) - - return images, list(targets)
+ return f"train={self.train}"
@@ -388,8 +453,8 @@

Source code for doctr.datasets.funsd

       
     
   
- - + + diff --git a/v0.2.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.2.0/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.2.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.2.0/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/datasets/ic03.html b/v0.2.0/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.2.0/_modules/doctr/datasets/ic03.html +++ b/v0.2.0/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/ic13.html b/v0.2.0/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.2.0/_modules/doctr/datasets/ic13.html +++ b/v0.2.0/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/iiit5k.html b/v0.2.0/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.2.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.2.0/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/iiithws.html b/v0.2.0/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.2.0/_modules/doctr/datasets/iiithws.html +++ b/v0.2.0/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/imgur5k.html b/v0.2.0/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.2.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.2.0/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/loader.html b/v0.2.0/_modules/doctr/datasets/loader.html index 5108e3b731..ed80350ef0 100644 --- a/v0.2.0/_modules/doctr/datasets/loader.html +++ b/v0.2.0/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.datasets.loader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-import tensorflow as tf
-import numpy as np
-from typing import List, Tuple, Dict, Any, Optional
+from typing import Callable, Optional
 
-from .multithreading import multithread_exec
+import numpy as np
+import tensorflow as tf
 
 __all__ = ["DataLoader"]
 
@@ -288,12 +314,13 @@ 

Source code for doctr.datasets.loader

     """Collate multiple elements into batches
 
     Args:
+    ----
         samples: list of N tuples containing M elements
 
     Returns:
+    -------
         Tuple of M sequences containing N elements each
     """
-
     batch_data = zip(*samples)
 
     tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
@@ -302,23 +329,23 @@ 

Source code for doctr.datasets.loader

 
 
 
-[docs] +[docs] class DataLoader: """Implements a dataset wrapper for fast data loading - Example:: - >>> from doctr.datasets import FUNSD, DataLoader - >>> train_set = CORD(train=True, download=True) - >>> train_loader = DataLoader(train_set, batch_size=32) - >>> train_iter = iter(train_loader) - >>> images, targets = next(train_iter) + >>> from doctr.datasets import CORD, DataLoader + >>> train_set = CORD(train=True, download=True) + >>> train_loader = DataLoader(train_set, batch_size=32) + >>> train_iter = iter(train_loader) + >>> images, targets = next(train_iter) Args: + ---- dataset: the dataset shuffle: whether the samples should be shuffled before passing it to the iterator batch_size: number of elements in each batch drop_last: if `True`, drops the last batch if it isn't full - workers: number of workers to use for data loading + collate_fn: function to merge samples into a batch """ def __init__( @@ -327,17 +354,22 @@

Source code for doctr.datasets.loader

         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        workers: Optional[int] = None,
+        collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
         self.batch_size = batch_size
         nb = len(self.dataset) / batch_size
         self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
-        self.workers = workers
+        if collate_fn is None:
+            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
+        else:
+            self.collate_fn = collate_fn
         self.reset()
 
+    def __len__(self) -> int:
+        return self.num_batches
+
     def reset(self) -> None:
         # Updates indices after each epoch
         self._num_yielded = 0
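A short sketch of the collate_fn hook added above; my_collate is a hypothetical callable, and when it is omitted the dataset's own collate_fn (or default_collate) is used, as the diff shows:

import tensorflow as tf
from doctr.datasets import CORD, DataLoader

train_set = CORD(train=True, download=True)

def my_collate(samples):
    # samples is a list of (image, target) tuples produced by the dataset
    images, targets = zip(*samples)
    return tf.stack(images, axis=0), list(targets)

train_loader = DataLoader(train_set, batch_size=32, collate_fn=my_collate)
print(len(train_loader))  # number of batches, thanks to the new __len__
images, targets = next(iter(train_loader))

In practice the images need a common shape (e.g. via the dataset's sample transforms) before they can be stacked into a single batch.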
@@ -353,9 +385,9 @@ 

Source code for doctr.datasets.loader

         if self._num_yielded < self.num_batches:
             # Get next indices
             idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
+            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
 
-            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
+            samples = list(map(self.dataset.__getitem__, indices))
 
             batch_data = self.collate_fn(samples)
 
@@ -396,8 +428,8 @@ 

Source code for doctr.datasets.loader

       
     
   
-
- +
+ diff --git a/v0.2.0/_modules/doctr/datasets/mjsynth.html b/v0.2.0/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.2.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.2.0/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/ocr.html b/v0.2.0/_modules/doctr/datasets/ocr.html index 5832933ea5..ce1ed8b0d4 100644 --- a/v0.2.0/_modules/doctr/datasets/ocr.html +++ b/v0.2.0/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -403,7 +403,7 @@

Source code for doctr.datasets.ocr

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/recognition.html b/v0.2.0/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.2.0/_modules/doctr/datasets/recognition.html +++ b/v0.2.0/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/sroie.html b/v0.2.0/_modules/doctr/datasets/sroie.html index 97f29ccdda..04cf10bda2 100644 --- a/v0.2.0/_modules/doctr/datasets/sroie.html +++ b/v0.2.0/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.datasets.sroie

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import csv
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
-import tensorflow as tf
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
-from doctr.documents.reader import read_img
-from .core import VisionDataset
+from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['SROIE']
+__all__ = ["SROIE"]
 
 
 
-[docs] +[docs] class SROIE(VisionDataset): """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction" <https://arxiv.org/pdf/2103.10213.pdf>`_. - Example:: - >>> from doctr.datasets import SROIE - >>> train_set = SROIE(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0 + :align: center + + >>> from doctr.datasets import SROIE + >>> train_set = SROIE(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip', - 'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip', - '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0", + "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f", + "sroie2019_train_task1.zip", + ) + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0", + "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2", + "sroie2019_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - self.sample_transforms = (lambda x: x) if sample_transforms is None else sample_transforms self.train = train - # # List images - self.root = os.path.join(self._root, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): - stem = Path(img_path).stem - _targets = [] - with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f: - for row in csv.reader(f, delimiter=','): - # Safeguard for blank lines - if len(row) > 0: - # Label may contain commas - label = ",".join(row[8:]) - # Reduce 8 coords to 4 - p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8]) - left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x) - top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y) - if len(label) > 0: - _targets.append((label, [left, top, right, bot])) - - text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets))) + tmp_root = os.path.join(self.root, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + np_dtype = np.float32 - def extra_repr(self) -> str: - return f"train={self.train}" + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))): + # File existence check + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") - def __getitem__(self, index: int) -> Tuple[tf.Tensor, Dict[str, Any]]: - img_name, target = self.data[index] - # Read image - img = tf.io.read_file(os.path.join(self.root, img_name)) - img = tf.image.decode_jpeg(img, channels=3) - img = self.sample_transforms(img) - - return img, target - - @staticmethod - def collate_fn(samples: List[Tuple[tf.Tensor, Dict[str, Any]]]) -> Tuple[tf.Tensor, List[Dict[str, Any]]]: - - images, targets = zip(*samples) - images = tf.stack(images, axis=0) + stem = Path(img_path).stem + with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f: + _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0] + + labels = [",".join(row[8:]) for row in _rows] + # reorder coordinates (8 -> (4,2) -> + # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines + coords: np.ndarray = np.stack( + [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0 + ) + + if not use_polygons: + # xmin, ymin, xmax, ymax + coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1) + + if recognition_task: + crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords) + for crop, label in zip(crops, labels): + if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0: + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, coords)) + else: + self.data.append((img_path, dict(boxes=coords, labels=labels))) + + self.root = tmp_root - return images, list(targets)
+ def extra_repr(self) -> str: + return f"train={self.train}"
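The coordinate handling introduced above can be checked in isolation with plain NumPy; this mirrors the reshape/min/max logic of the hunk, with a made-up annotation row:

import numpy as np

# One SROIE-style row: 8 flat values = 4 (x, y) corners, followed by the transcription
rows = [["10", "20", "110", "20", "110", "60", "10", "60", "TOTAL"]]

# use_polygons=True keeps the (N, 4, 2) corner representation
coords = np.stack(
    [np.array(list(map(int, row[:8])), dtype=np.float32).reshape((4, 2)) for row in rows], axis=0
)

# otherwise the corners are reduced to straight boxes: xmin, ymin, xmax, ymax
boxes = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1)
print(boxes)  # [[ 10.  20. 110.  60.]]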
@@ -396,8 +444,8 @@

Source code for doctr.datasets.sroie

       
     
   
- - + + diff --git a/v0.2.0/_modules/doctr/datasets/svhn.html b/v0.2.0/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.2.0/_modules/doctr/datasets/svhn.html +++ b/v0.2.0/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/svt.html b/v0.2.0/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.2.0/_modules/doctr/datasets/svt.html +++ b/v0.2.0/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/synthtext.html b/v0.2.0/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.2.0/_modules/doctr/datasets/synthtext.html +++ b/v0.2.0/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.2.0/_modules/doctr/datasets/utils.html b/v0.2.0/_modules/doctr/datasets/utils.html index aedf276e89..bde9304597 100644 --- a/v0.2.0/_modules/doctr/datasets/utils.html +++ b/v0.2.0/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.datasets.utils

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import string
 import unicodedata
+from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Sequence as SequenceType
+
 import numpy as np
-from typing import List, Optional, Any
+from PIL import Image
+
+from doctr.io.image import get_img_shape
+from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
 
 from .vocabs import VOCABS
 
-__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
+__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
+
+ImageTensor = TypeVar("ImageTensor")
 
 
 def translate(
     input_string: str,
     vocab_name: str,
-    unknown_char: str = '■',
+    unknown_char: str = "■",
 ) -> str:
     """Translate a string input in a given vocabulary
 
     Args:
+    ----
         input_string: input string to translate
         vocab_name: vocabulary to use (french, latin, ...)
         unknown_char: unknown character for non-translatable characters
 
     Returns:
-        A string translated in a given vocab"""
-
+    -------
+        A string translated in a given vocab
+    """
     if VOCABS.get(vocab_name) is None:
         raise KeyError("output vocabulary must be in vocabs dictionary")
 
-    translated = ''
+    translated = ""
     for char in input_string:
         if char not in VOCABS[vocab_name]:
             # we need to translate char into a vocab char
@@ -310,85 +350,177 @@ 

Source code for doctr.datasets.utils

                 # remove whitespaces
                 continue
             # normalize character if it is not in vocab
-            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
-            if char == '' or char not in VOCABS[vocab_name]:
+            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
+            if char == "" or char not in VOCABS[vocab_name]:
                 # if normalization fails or char still not in vocab, fall back to the unknown character
                 char = unknown_char
         translated += char
     return translated
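A quick illustration of translate as defined above, assuming the "french" vocab mentioned in the docstring is present in VOCABS; characters that cannot be normalised into the target vocab fall back to the unknown character (the example strings are made up):

from doctr.datasets.utils import translate

print(translate("crème brûlée", "french"))                   # characters already in the vocab are kept
print(translate("doctr ♥ OCR", "french"))                    # '♥' should come back as '■'
print(translate("doctr ♥ OCR", "french", unknown_char="?"))  # or as a custom unknown character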
 
 
-def encode_sequence(
+def encode_string(
     input_string: str,
     vocab: str,
-) -> List[str]:
+) -> List[int]:
     """Given a predefined mapping, encode the string to a sequence of numbers
 
     Args:
+    ----
         input_string: string to encode
         vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A list encoding the input_string"""
-
-    return list(map(vocab.index, input_string))
+    -------
+        A list encoding the input_string
+    """
+    try:
+        return list(map(vocab.index, input_string))
+    except ValueError:
+        raise ValueError(
+            f"some characters cannot be found in 'vocab'. \
+                         Please check the input string {input_string} and the vocabulary {vocab}"
+        )
 
 
 def decode_sequence(
-    input_array: np.array,
+    input_seq: Union[np.ndarray, SequenceType[int]],
     mapping: str,
 ) -> str:
     """Given a predefined mapping, decode the sequence of numbers to a string
 
     Args:
-        input_array: array to decode
+    ----
+        input_seq: array to decode
         mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A string, decoded from input_array"""
-
-    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
+    -------
+        A string, decoded from input_seq
+    """
+    if not isinstance(input_seq, (Sequence, np.ndarray)):
+        raise TypeError("Invalid sequence type")
+    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
         raise AssertionError("Input must be an array of int, with max less than mapping size")
-    decoded = ''.join(mapping[idx] for idx in input_array)
-    return decoded
+
+    return "".join(map(mapping.__getitem__, input_seq))
 
 
 
-[docs] +[docs] def encode_sequences( sequences: List[str], vocab: str, target_size: Optional[int] = None, eos: int = -1, - **kwargs: Any, + sos: Optional[int] = None, + pad: Optional[int] = None, + dynamic_seq_length: bool = False, ) -> np.ndarray: """Encode character sequences using a given vocab as mapping Args: + ---- sequences: the list of character sequences of size N vocab: the ordered vocab to use for encoding target_size: maximum length of the encoded data eos: encoding of End Of String + sos: optional encoding of Start Of String + pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD + dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size Returns: + ------- the padded encoded data as a tensor """ - if 0 <= eos < len(vocab): raise ValueError("argument 'eos' needs to be outside of vocab possible indices") - if not isinstance(target_size, int): - target_size = max(len(w) for w in sequences) + if not isinstance(target_size, int) or dynamic_seq_length: + # Maximum string length + EOS + max_length = max(len(w) for w in sequences) + 1 + if isinstance(sos, int): + max_length += 1 + if isinstance(pad, int): + max_length += 1 + target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size) # Pad all sequences - encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32) - - for idx, seq in enumerate(sequences): - encoded_seq = encode_sequence(seq, vocab) - encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)] + if isinstance(pad, int): # pad with padding symbol + if 0 <= pad < len(vocab): + raise ValueError("argument 'pad' needs to be outside of vocab possible indices") + # In that case, add EOS at the end of the word before padding + default_symbol = pad + else: # pad with eos symbol + default_symbol = eos + encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32) + + # Encode the strings + for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)): + if isinstance(pad, int): # add eos at the end of the sequence + seq.append(eos) + encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)] + + if isinstance(sos, int): # place sos symbol at the beginning of each sequence + if 0 <= sos < len(vocab): + raise ValueError("argument 'sos' needs to be outside of vocab possible indices") + encoded_data = np.roll(encoded_data, 1) + encoded_data[:, 0] = sos return encoded_data
+ + +def convert_target_to_relative( + img: ImageTensor, target: Union[np.ndarray, Dict[str, Any]] +) -> Tuple[ImageTensor, Union[Dict[str, Any], np.ndarray]]: + if isinstance(target, np.ndarray): + target = convert_to_relative_coords(target, get_img_shape(img)) + else: + target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img)) + return img, target + + +def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]: + """Crop a set of bounding boxes from an image + + Args: + ---- + img_path: path to the image + geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4) + + Returns: + ------- + a list of cropped images + """ + with Image.open(img_path) as pil_img: + img: np.ndarray = np.array(pil_img.convert("RGB")) + # Polygon + if geoms.ndim == 3 and geoms.shape[1:] == (4, 2): + return extract_rcrops(img, geoms.astype(dtype=int)) + if geoms.ndim == 2 and geoms.shape[1] == 4: + return extract_crops(img, geoms.astype(dtype=int)) + raise ValueError("Invalid geometry format") + + +def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]: + """Converts multiclass target to relative coordinates. + + Args: + ---- + img: Image + target: tuple of target polygons and their classes names + + Returns: + ------- + Image and dictionary of boxes, with class names as keys + """ + boxes = convert_to_relative_coords(target[0], get_img_shape(img)) + boxes_classes = target[1] + boxes_dict: Dict = {k: [] for k in sorted(set(boxes_classes))} + for k, poly in zip(boxes_classes, boxes): + boxes_dict[k].append(poly) + boxes_dict = {k: np.stack(v, axis=0) for k, v in boxes_dict.items()} + return img, boxes_dict
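A sketch of how the extended encode_sequences signature behaves with explicit eos/pad codes; the vocab is a toy example and the expected array is traced from the logic in this hunk:

from doctr.datasets.utils import encode_sequences

vocab = "abcdefghijklmnopqrstuvwxyz"
encoded = encode_sequences(["cat", "go"], vocab, eos=26, pad=27)
# Each word is followed by one EOS (26), then padded with PAD (27) up to the longest length:
# array([[ 2,  0, 19, 26, 27],
#        [ 6, 14, 26, 27, 27]], dtype=int32)
print(encoded)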
@@ -421,8 +553,8 @@

Source code for doctr.datasets.utils

       
     
   
- - + + diff --git a/v0.2.0/_modules/doctr/datasets/wildreceipt.html b/v0.2.0/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.2.0/_modules/doctr/datasets/wildreceipt.html +++ b/v0.2.0/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.2.0/_modules/doctr/documents/elements.html b/v0.2.0/_modules/doctr/documents/elements.html deleted file mode 100644 index df3a989d4a..0000000000 --- a/v0.2.0/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,550 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional
-
-from doctr.utils.geometry import resolve_enclosing_bbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[BoundingBox] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - geometry = resolve_enclosing_bbox([w.geometry for w in words]) - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[BoundingBox] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - geometry = resolve_enclosing_bbox(line_boxes + artefact_boxes) - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - - def show(self, page: np.ndarray, interactive: bool = True, **kwargs) -> None: - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Plot the results""" - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/_modules/doctr/documents/reader.html b/v0.2.0/_modules/doctr/documents/reader.html deleted file mode 100644 index 43865531a4..0000000000 --- a/v0.2.0/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,606 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a web page and convert it into a PDF file as a bytes stream - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/_modules/doctr/io/elements.html b/v0.2.0/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.2.0/_modules/doctr/io/elements.html +++ b/v0.2.0/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.2.0/_modules/doctr/io/html.html b/v0.2.0/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.2.0/_modules/doctr/io/html.html +++ b/v0.2.0/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.2.0/_modules/doctr/io/image/base.html b/v0.2.0/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.2.0/_modules/doctr/io/image/base.html +++ b/v0.2.0/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.2.0/_modules/doctr/io/image/tensorflow.html b/v0.2.0/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.2.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.2.0/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.2.0/_modules/doctr/io/pdf.html b/v0.2.0/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.2.0/_modules/doctr/io/pdf.html +++ b/v0.2.0/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.2.0/_modules/doctr/io/reader.html b/v0.2.0/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.2.0/_modules/doctr/io/reader.html +++ b/v0.2.0/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.2.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.2.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.2.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.2.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.2.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.2.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.2.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/classification/zoo.html b/v0.2.0/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.2.0/_modules/doctr/models/classification/zoo.html +++ b/v0.2.0/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

<
- + diff --git a/v0.2.0/_modules/doctr/models/detection/differentiable_binarization.html b/v0.2.0/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index aef0023c40..0000000000 --- a/v0.2.0/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,876 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.differentiable_binarization - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/db_resnet50-98ba765d.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to unshrink polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the p_map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: The first parameter.
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly cast to an ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
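For intuition, with the default unclip_ratio of 1.5, a detected 100 x 20 px text region (area 2000, perimeter 240) gives an offset distance of 2000 * 1.5 / 240 = 12.5 px, so the returned bounding rect grows to roughly 125 x 45 px.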
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            numpy array of boxes for the bitmap, each box being a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box (relative coordinates)
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            epsilon = 0.01 * cv2.arcLength(contour, True)
-            approx = cv2.approxPolyDP(contour, epsilon, True)  # approximate contour by a polygon
-            points = approx.reshape((-1, 2))  # get polygon points
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too-small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channels to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):  # top-down pathway, from the coarsest map to the finest
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature map is mapped to
-    """
-
-    _children_names = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
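Concretely, the law-of-cosines term above detects whether the orthogonal projection of a point falls inside the segment: for a = (0, 0) and b = (2, 0), the point (1, 1) gets the perpendicular distance 1, while for (3, 0) the cosine term is negative and the distance falls back to the nearest endpoint, which is also 1.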
-
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon treshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coord., to draw the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
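Putting the pieces together, the objective implemented above is total_loss = 5 * balanced_bce + dice + 10 * l1: a hard-negative-balanced BCE on the probability map, a BCE-weighted dice loss on the approximate binary map 1 / (1 + exp(-50 * (P - T))), and an L1 term on the threshold map, the latter two evaluated only where the corresponding masks are valid.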
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
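As a quick illustration of the three output modes handled in call above, a hedged sketch with dummy data (relative box coordinates, no pretrained weights):

    >>> import numpy as np, tensorflow as tf
    >>> from doctr.models import db_resnet50
    >>> model = db_resnet50(pretrained=False)
    >>> x = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    >>> targets = [{'boxes': np.array([[0.1, 0.1, 0.4, 0.2]], dtype=np.float32), 'flags': np.array([False])}]
    >>> out = model(x, target=targets, return_model_output=True, return_boxes=True)
    >>> # out now holds 'out_map' (probability map), 'boxes' (post-processed detections) and 'loss'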
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.2.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 4325d0b74a..66cef8663d 100644 --- a/v0.2.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -759,7 +759,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.2.0/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.2.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/detection/linknet.html b/v0.2.0/_modules/doctr/models/detection/linknet.html deleted file mode 100644 index 42db111bb3..0000000000 --- a/v0.2.0/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,637 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.linknet - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the p_map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from the LinkNet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            numpy array of boxes for the bitmap, each box being a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box (relative coordinates)
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.2.0/_modules/doctr/models/detection/linknet/tensorflow.html index dbb58e37cf..ce995f99d4 100644 --- a/v0.2.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -716,7 +716,7 @@

Source code for doctr.models.detection.linknet.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/detection/zoo.html b/v0.2.0/_modules/doctr/models/detection/zoo.html index 55630ebacb..3651c4e2d3 100644 --- a/v0.2.0/_modules/doctr/models/detection/zoo.html +++ b/v0.2.0/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.detection.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Dict, Any
-from .core import DetectionPredictor, DetectionPreProcessor
-from .. import detection
+from typing import Any, List
+
+from doctr.file_utils import is_tf_available, is_torch_available
 
+from .. import detection
+from ..detection.fast import reparameterize
+from ..preprocessor import PreProcessor
+from .predictor import DetectionPredictor
 
 __all__ = ["detection_predictor"]
 
-ARCHS = ['db_resnet50', 'linknet']
+ARCHS: List[str]
+
 
+if is_tf_available():
+    ARCHS = [
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
+elif is_torch_available():
+    ARCHS = [
+        "db_resnet34",
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-    # Detection
-    _model = detection.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
+        _model = detection.__dict__[arch](
+            pretrained=pretrained,
+            pretrained_backbone=kwargs.get("pretrained_backbone", True),
+            assume_straight_pages=assume_straight_pages,
+        )
+        # Reparameterize FAST models by default to lower inference latency and memory usage
+        if isinstance(_model, detection.FAST):
+            _model = reparameterize(_model)
+    else:
+        if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+
+        _model = arch
+        _model.assume_straight_pages = assume_straight_pages
+        _model.postprocessor.assume_straight_pages = assume_straight_pages
+
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 2)
     predictor = DetectionPredictor(
-        DetectionPreProcessor(output_size=_model.cfg['input_shape'][:2], **kwargs),
-        _model
+        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
+        _model,
     )
     return predictor
 
 
 
-[docs] -def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: +[docs] +def detection_predictor( + arch: Any = "fast_base", + pretrained: bool = False, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + batch_size: int = 2, + **kwargs: Any, +) -> DetectionPredictor: """Text detection architecture. - Example:: - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_resnet50') + ---- + arch: name of the architecture or model itself to use (e.g. 'db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset + assume_straight_pages: If True, fit straight boxes to the page + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right + batch_size: number of samples the model processes in parallel + **kwargs: optional keyword arguments passed to the architecture Returns: + ------- Detection predictor """ - - return _predictor(arch, pretrained, **kwargs)
+ return _predictor( + arch=arch, + pretrained=pretrained, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + batch_size=batch_size, + **kwargs, + )
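Note that with the updated signature above, arch can also be an instantiated detection model rather than a string. A short sketch building a predictor around a DBNet instance:

    >>> from doctr.models import db_resnet50, detection_predictor
    >>> model = db_resnet50(pretrained=True)
    >>> predictor = detection_predictor(arch=model, assume_straight_pages=True, batch_size=1)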
@@ -354,8 +449,8 @@

Source code for doctr.models.detection.zoo

       
     
   
- - + + diff --git a/v0.2.0/_modules/doctr/models/export.html b/v0.2.0/_modules/doctr/models/export.html deleted file mode 100644 index f25a81aa21..0000000000 --- a/v0.2.0/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@ - - - - - - - - - - - - doctr.models.export - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/_modules/doctr/models/factory/hub.html b/v0.2.0/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.2.0/_modules/doctr/models/factory/hub.html +++ b/v0.2.0/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.2.0/_modules/doctr/models/recognition/crnn.html b/v0.2.0/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index db8bbc2c27..0000000000 --- a/v0.2.0/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,579 +0,0 @@ - doctr.models.recognition.crnn - docTR documentation
-
-
- -
- -
-
- -
- -
-
- -
-
-
-
- -
- -
-
-

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def ctc_decoder(
-        self,
-        logits: tf.Tensor
-    ) -> tf.Tensor:
-        """
-        Decode logits with CTC decoder from keras backend
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            decoded logits, shape BATCH_SIZE X SEQ_LEN
-        """
-        # computing prediction with ctc decoder
-        _prediction = tf.nn.ctc_greedy_decoder(
-            tf.nn.softmax(tf.transpose(logits, perm=[1, 0, 2])),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            merge_repeated=True
-        )[0][0]
-        prediction = tf.sparse.to_dense(_prediction, default_value=len(self.vocab))
-
-        return prediction
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[str]:
-        """
-        Performs CTC decoding of the raw model output, then maps the decoded predictions
-        to characters with the label_to_idx mapping dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # decode ctc for ctc models
-        predictions = self.ctc_decoder(logits)
-
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, predictions),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        words_list = [word.decode() for word in list(decoded_strings_pred.numpy())]
-
-        if self.ignore_case:
-            words_list = [word.lower() for word in words_list]
-
-        if self.ignore_accents:
-            raise NotImplementedError
-
-        return words_list
-
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            model_output: predicted logits of the model
-            target: list of ground truth labels (words) to encode and compare against
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, tf.Tensor]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        decoded_features = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
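For intuition on the post-processing performed by CTCPostProcessor, the following self-contained sketch runs greedy CTC decoding on random logits; the toy vocabulary and sequence length are assumptions and do not match the pretrained configurations above:

    >>> import tensorflow as tf
    >>> vocab = "abc"
    >>> logits = tf.random.normal((2, 32, len(vocab) + 1))  # B x SEQ_LEN x (NUM_CLASSES + 1)
    >>> decoded, _ = tf.nn.ctc_greedy_decoder(
    ...     tf.nn.softmax(tf.transpose(logits, perm=[1, 0, 2])),  # time-major probabilities
    ...     tf.fill([logits.shape[0]], logits.shape[1]),  # every sequence uses the full width
    ...     merge_repeated=True,
    ... )
    >>> dense = tf.sparse.to_dense(decoded[0], default_value=len(vocab))  # pad with the blank index
    >>> words = ["".join(vocab[idx] for idx in seq if idx < len(vocab)) for seq in dense.numpy()]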
-
-
-
- - -
-
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/crnn/tensorflow.html index e50c245923..bc64da9a1b 100644 --- a/v0.2.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -658,7 +658,7 @@

Source code for doctr.models.recognition.crnn.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/master/tensorflow.html index 152ebb7e59..aa6aa69325 100644 --- a/v0.2.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -655,7 +655,7 @@

Source code for doctr.models.recognition.master.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.2.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/recognition/sar.html b/v0.2.0/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 7b3a3e74b1..0000000000 --- a/v0.2.0/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,709 +0,0 @@ - doctr.models.recognition.sar - docTR documentation
-
-
- -
- -
-
- -
- -
-
- -
-
-
-
- -
- -
-
-

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H, W, C) -> (N, C)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, feature_channels)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + feature_channels) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, tf.Tensor]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[str]:
-        # compute pred with argmax for attention models
-        pred = tf.math.argmax(logits, axis=2)
-
-        # decode raw output of the model with tf_label_to_idx
-        pred = tf.cast(pred, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, pred), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        words_list = [word.decode() for word in list(decoded_strings_pred.numpy())]
-
-        if self.ignore_case:
-            words_list = [word.lower() for word in words_list]
-
-        if self.ignore_accents:
-            raise NotImplementedError
-
-        return words_list
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- -
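To make the attention step of AttentionModule concrete, the sketch below reproduces its shape manipulations on random tensors; the dimensions are arbitrary assumptions rather than the values used by the pretrained SAR models:

    >>> import tensorflow as tf
    >>> N, H, W, C, attention_units, rnn_units = 1, 4, 8, 16, 32, 64
    >>> features = tf.random.normal((N, H, W, C))  # backbone feature map
    >>> hidden_state = tf.random.normal((N, 1, 1, rnn_units))  # current decoder state
    >>> f_proj = tf.keras.layers.Conv2D(attention_units, 3, padding="same")(features)
    >>> h_proj = tf.keras.layers.Conv2D(attention_units, 1, padding="same", use_bias=False)(hidden_state)
    >>> scores = tf.keras.layers.Conv2D(1, 1, use_bias=False)(tf.math.tanh(h_proj + f_proj))
    >>> weights = tf.nn.softmax(tf.reshape(scores, (N, H * W)))  # one weight per spatial location
    >>> glimpse = tf.reduce_sum(features * tf.reshape(weights, (N, H, W, 1)), axis=[1, 2])  # shape (N, C)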
-
-
-
- - -
-
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/sar/tensorflow.html index 010bc2bc54..4a591e6451 100644 --- a/v0.2.0/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -757,7 +757,7 @@

Source code for doctr.models.recognition.sar.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.2.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.2.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/models/recognition/zoo.html b/v0.2.0/_modules/doctr/models/recognition/zoo.html index a4d43d1801..f664304019 100644 --- a/v0.2.0/_modules/doctr/models/recognition/zoo.html +++ b/v0.2.0/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.recognition.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Dict, Any
-from .core import RecognitionPredictor, RecognitionPreProcessor
-from .. import recognition
+from typing import Any, List
 
+from doctr.file_utils import is_tf_available
+from doctr.models.preprocessor import PreProcessor
+
+from .. import recognition
+from .predictor import RecognitionPredictor
 
 __all__ = ["recognition_predictor"]
 
-ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
 
+ARCHS: List[str] = [
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "sar_resnet31",
+    "master",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
+]
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-    _model = recognition.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    predictor = RecognitionPredictor(
-        RecognitionPreProcessor(output_size=_model.cfg['input_shape'][:2], **kwargs),
-        _model
-    )
+        _model = recognition.__dict__[arch](
+            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
+        )
+    else:
+        if not isinstance(
+            arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+        ):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
+
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
+    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
+    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
 
     return predictor
 
 
 
-[docs] -def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor: +[docs] +def recognition_predictor( + arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + symmetric_pad: bool = False, + batch_size: int = 128, + **kwargs: Any, +) -> RecognitionPredictor: """Text recognition architecture. Example:: @@ -313,14 +369,18 @@

Source code for doctr.models.recognition.zoo

        >>> out = model([input_page])
 
     Args:
-        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
+    ----
+        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
+        **kwargs: optional parameters to be passed to the architecture
 
     Returns:
+    -------
         Recognition predictor
     """
-
-    return _predictor(arch, pretrained, **kwargs)
+ return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
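Besides the architecture names listed in ARCHS, the predictor also accepts an already instantiated recognition model, as the isinstance check above shows. A short sketch of both call styles (availability of pretrained weights is assumed):

    >>> from doctr.models import recognition_predictor, crnn_vgg16_bn
    >>> # from an architecture name
    >>> predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=64)
    >>> # from a model instance (must be a CRNN, SAR, MASTER, ViTSTR or PARSeq model)
    >>> model = crnn_vgg16_bn(pretrained=True)
    >>> predictor = recognition_predictor(model)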
@@ -354,8 +414,8 @@

Source code for doctr.models.recognition.zoo

   
-
- +
+ diff --git a/v0.2.0/_modules/doctr/models/zoo.html b/v0.2.0/_modules/doctr/models/zoo.html index dec6857019..d459671648 100644 --- a/v0.2.0/_modules/doctr/models/zoo.html +++ b/v0.2.0/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from typing import Any
-from .core import OCRPredictor
+
 from .detection.zoo import detection_predictor
+from .kie_predictor import KIEPredictor
+from .predictor import OCRPredictor
 from .recognition.zoo import recognition_predictor
 
+__all__ = ["ocr_predictor", "kie_predictor"]
 
-__all__ = ["ocr_predictor"]
-
-
-def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
 
+def _predictor(
+    det_arch: Any,
+    reco_arch: Any,
+    pretrained: bool,
+    pretrained_backbone: bool = True,
+    assume_straight_pages: bool = True,
+    preserve_aspect_ratio: bool = True,
+    symmetric_pad: bool = True,
+    det_bs: int = 2,
+    reco_bs: int = 128,
+    detect_orientation: bool = False,
+    straighten_pages: bool = False,
+    detect_language: bool = False,
+    **kwargs,
+) -> OCRPredictor:
     # Detection
-    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
+    det_predictor = detection_predictor(
+        det_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=det_bs,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+    )
 
     # Recognition
-    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
+    reco_predictor = recognition_predictor(
+        reco_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=reco_bs,
+    )
 
-    return OCRPredictor(det_predictor, reco_predictor)
+    return OCRPredictor(
+        det_predictor,
+        reco_predictor,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+        detect_orientation=detect_orientation,
+        straighten_pages=straighten_pages,
+        detect_language=detect_language,
+        **kwargs,
+    )
 
 
 
-[docs] +[docs] def ocr_predictor( - det_arch: str = 'db_resnet50', - reco_arch: str = 'crnn_vgg16_bn', + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", pretrained: bool = False, - **kwargs: Any + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - Example:: - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` Returns: + ------- OCR predictor """ + return _predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
+ + - return _predictor(det_arch, reco_arch, pretrained, **kwargs)
+def _kie_predictor( + det_arch: Any, + reco_arch: Any, + pretrained: bool, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + det_bs: int = 2, + reco_bs: int = 128, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs, +) -> KIEPredictor: + # Detection + det_predictor = detection_predictor( + det_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=det_bs, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + ) + + # Recognition + reco_predictor = recognition_predictor( + reco_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=reco_bs, + ) + + return KIEPredictor( + det_predictor, + reco_predictor, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + ) + + +
+[docs] +def kie_predictor( + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, +) -> KIEPredictor: + """End-to-end KIE architecture using one model for localization, and another for text recognition. + + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` + + Returns: + ------- + KIE predictor + """ + return _kie_predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
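Both predictors expose the same keyword arguments, so the flags documented above can be combined freely. A brief sketch with illustrative, non-default settings (the kie_predictor import path is assumed to mirror ocr_predictor in the doctr.models package):

    >>> import numpy as np
    >>> from doctr.models import ocr_predictor, kie_predictor
    >>> page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
    >>> ocr = ocr_predictor("db_resnet50", "crnn_vgg16_bn", pretrained=True,
    ...                     assume_straight_pages=False, export_as_straight_boxes=True)
    >>> kie = kie_predictor("db_resnet50", "crnn_vgg16_bn", pretrained=True, detect_orientation=True)
    >>> ocr_out = ocr([page])
    >>> kie_out = kie([page])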

@@ -353,8 +575,8 @@

Source code for doctr.models.zoo

       
     
   
- - + + diff --git a/v0.2.0/_modules/doctr/transforms/modules.html b/v0.2.0/_modules/doctr/transforms/modules.html deleted file mode 100644 index 214233e166..0000000000 --- a/v0.2.0/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,716 +0,0 @@ - doctr.transforms.modules - docTR documentation
-
-
- -
- -
-
- -
- -
-
- -
-
-
-
- -
- -
-
-

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - - def extra_repr(self) -> str: - return f"output_size={self.output_size}, method='{self.method}'" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - img = tf.image.pad_to_bounding_box(img, 0, 0, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
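Since these transformations all operate on the same image tensors, they compose naturally with Compose, OneOf and RandomApply into an augmentation pipeline; the randomized variants are exposed as RandomBrightness, RandomContrast, RandomJpegQuality, and so on. A sketch with arbitrary parameter choices:

    >>> import tensorflow as tf
    >>> from doctr.transforms import (Compose, Resize, Normalize, ColorInversion, RandomApply,
    ...                               OneOf, RandomBrightness, RandomContrast, RandomJpegQuality)
    >>> transfos = Compose([
    ...     Resize((32, 128)),
    ...     RandomApply(ColorInversion(min_val=0.6), p=0.3),
    ...     OneOf([RandomBrightness(max_delta=0.3), RandomContrast(delta=0.3), RandomJpegQuality(60, 100)]),
    ...     Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)),
    ... ])
    >>> out = transfos(tf.random.uniform(shape=[64, 256, 3], minval=0, maxval=1))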
-
-
-
- - -
-
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/_modules/doctr/transforms/modules/base.html b/v0.2.0/_modules/doctr/transforms/modules/base.html index 96ebd680b7..4596df3848 100644 --- a/v0.2.0/_modules/doctr/transforms/modules/base.html +++ b/v0.2.0/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -643,7 +643,7 @@

Source code for doctr.transforms.modules.base

- + diff --git a/v0.2.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.2.0/_modules/doctr/transforms/modules/tensorflow.html index 0e18bcc922..acbbe96225 100644 --- a/v0.2.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.2.0/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -956,7 +956,7 @@

Source code for doctr.transforms.modules.tensorflow

- + diff --git a/v0.2.0/_modules/doctr/utils/metrics.html b/v0.2.0/_modules/doctr/utils/metrics.html index afd16328c6..8a37d5949a 100644 --- a/v0.2.0/_modules/doctr/utils/metrics.html +++ b/v0.2.0/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.utils.metrics

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-from rapidfuzz.string_metric import levenshtein
-from typing import List, Tuple
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
+from shapely.geometry import Polygon
 
-__all__ = ['ExactMatch', 'box_iou', 'assign_pairs', 'LocalizationConfusion', 'OCRMetric']
+__all__ = [
+    "TextMatch",
+    "box_iou",
+    "polygon_iou",
+    "nms",
+    "LocalizationConfusion",
+    "OCRMetric",
+    "DetectionMetric",
+]
 
 
-
-[docs] -class ExactMatch: - """Implements exact match metric (word-level accuracy) for recognition task. +def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]: + """Performs string comparison with multiple levels of tolerance - The aggregated metric is computed as follows: + Args: + ---- + word1: a string + word2: another string - .. math:: - \\forall X, Y \\in \\mathcal{W}^N, - ExactMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) + Returns: + ------- + a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their + anyascii counterparts and their lower-case anyascii counterparts match + """ + raw_match = word1 == word2 + caseless_match = word1.lower() == word2.lower() + anyascii_match = anyascii(word1) == anyascii(word2) - with the indicator function :math:`f_{a}` defined as: + # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched + unicase_match = anyascii(word1).lower() == anyascii(word2).lower() - .. math:: - \\forall a, x \\in \\mathcal{W}, - f_a(x) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } x = a \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{W}` is the set of all possible character sequences, - :math:`N` is a strictly positive integer. + return raw_match, caseless_match, anyascii_match, unicase_match - Example:: - >>> from doctr.utils import ExactMatch - >>> metric = ExactMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() - Args: - ignore_case: if true, ignore letter case when computing metric - ignore_accents: if true, ignore accents errors when computing metrics""" +
+[docs] +class TextMatch: + r"""Implements text match metric (word-level accuracy) for recognition task. - def __init__( - self, - ignore_case: bool = False, - ignore_accents: bool = False, - ) -> None: + The raw aggregated metric is computed as follows: - self.matches = 0 - self.total = 0 - self.ignore_case = ignore_case - self.ignore_accents = ignore_accents + .. math:: + \forall X, Y \in \mathcal{W}^N, + TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) - @staticmethod - def remove_accent(input_string: str) -> str: - """Removes all accents (¨^çéè...) from input_string + with the indicator function :math:`f_{a}` defined as: - Args: - input_string: character sequence with accents + .. math:: + \forall a, x \in \mathcal{W}, + f_a(x) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } x = a \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{W}` is the set of all possible character sequences, + :math:`N` is a strictly positive integer. - Returns: - character sequence without accents""" + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() + """ - raise NotImplementedError + def __init__(self) -> None: + self.reset() +
+[docs] def update( self, gt: List[str], @@ -348,53 +386,66 @@

Source code for doctr.utils.metrics

         """Update the state of the metric with new predictions
 
         Args:
+        ----
             gt: list of ground-truth character sequences
-            pred: list of predicted character sequences"""
-
+            pred: list of predicted character sequences
+        """
         if len(gt) != len(pred):
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
-        for pred_word, gt_word in zip(pred, gt):
-            if self.ignore_accents:
-                gt_word = self.remove_accent(gt_word)
-                pred_word = self.remove_accent(pred_word)
-
-            if self.ignore_case:
-                gt_word = gt_word.lower()
-                pred_word = pred_word.lower()
+        for gt_word, pred_word in zip(gt, pred):
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
+            self.raw += int(_raw)
+            self.caseless += int(_caseless)
+            self.anyascii += int(_anyascii)
+            self.unicase += int(_unicase)
 
-            if pred_word == gt_word:
-                self.matches += 1
+        self.total += len(gt)
- self.total += len(gt) - def summary(self) -> float: - """Computes the aggregated evaluation +
+[docs] + def summary(self) -> Dict[str, float]: + """Computes the aggregated metrics - Returns: - metric result""" + Returns + ------- + a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii + counterpart and its lower-case anyascii counterpart + """ if self.total == 0: raise AssertionError("you need to update the metric before getting the summary") - return self.matches / self.total + + return dict( + raw=self.raw / self.total, + caseless=self.caseless / self.total, + anyascii=self.anyascii / self.total, + unicase=self.unicase / self.total, + )
+ def reset(self) -> None: - self.matches = 0 + self.raw = 0 + self.caseless = 0 + self.anyascii = 0 + self.unicase = 0 self.total = 0
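As a quick illustration of the tolerance levels combined by the new helper, here is a minimal sketch of `string_match` (the return values in the comment are inferred from the implementation above, not from an official doctest):

>>> from doctr.utils.metrics import string_match
>>> string_match("Geronimo", "geronimo")  # expected: (False, True, False, True)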
def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Compute the IoU between two sets of bounding boxes + """Computes the IoU between two sets of bounding boxes Args: + ---- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) Returns: + ------- the IoU matrix of shape (N, M) """ - - iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) + iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) @@ -405,169 +456,244 @@

Source code for doctr.utils.metrics

         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
 
 
-def assign_pairs(score_mat: np.ndarray, score_threshold: float = 0.5) -> Tuple[np.ndarray, np.ndarray]:
-    """Assigns candidates by maximizing the score of all pairs
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
+    """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
-        score_mat: score matrix
-        score_threshold: minimum score to validate an assignment
+    ----
+        polys_1: rotated bounding boxes of shape (N, 4, 2)
+        polys_2: rotated bounding boxes of shape (M, 4, 2)
+
     Returns:
-        a tuple of two lists: the list of assigned row candidates indices, and the list of their column counterparts
+    -------
+        the IoU matrix of shape (N, M)
     """
+    if polys_1.ndim != 3 or polys_2.ndim != 3:
+        raise AssertionError("expects boxes to be in format (N, 4, 2)")
+
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
+
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
+
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
 
-    row_ind, col_ind = linear_sum_assignment(-score_mat)
-    is_kept = score_mat[row_ind, col_ind] >= score_threshold
-    return row_ind[is_kept], col_ind[is_kept]
+    return iou_mat
+
+
+def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
+    """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
+
+    Args:
+    ----
+        boxes: np array of straight boxes: (*, 5), (xmin, ymin, xmax, ymax, score)
+        thresh: iou threshold to perform box suppression.
+
+    Returns:
+    -------
+        A list of box indexes to keep
+    """
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    x2 = boxes[:, 2]
+    y2 = boxes[:, 3]
+    scores = boxes[:, 4]
+
+    areas = (x2 - x1) * (y2 - y1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1)
+        h = np.maximum(0.0, yy2 - yy1)
+        inter = w * h
+        ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+        inds = np.where(ovr <= thresh)[0]
+        order = order[inds + 1]
+    return keep
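Since `nms` ships without a doctest, here is a minimal usage sketch with made-up boxes in the documented (xmin, ymin, xmax, ymax, score) format:

>>> import numpy as np
>>> from doctr.utils.metrics import nms
>>> boxes = np.array([[0, 0, 100, 100, 0.9], [5, 5, 100, 100, 0.8], [200, 200, 300, 300, 0.7]], dtype=np.float32)
>>> nms(boxes, thresh=0.5)  # the second box overlaps the first (IoU ~ 0.9) and is suppressed, keeping indices 0 and 2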
 
 
 
-[docs] +[docs] class LocalizationConfusion: - """Implements common confusion metrics and mean IoU for localization evaluation. + r"""Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ - Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) + \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ + Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ + Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ + meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \\forall y \\in \\mathcal{B}, - g_X(y) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, + \forall y \in \mathcal{B}, + g_X(y) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ - def __init__(self, iou_thresh: float = 0.5) -> None: - + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: self.iou_thresh = iou_thresh - self.num_gts = 0 - self.num_preds = 0 - self.num_matches = 0 - self.tot_iou = 0. + self.use_polygons = use_polygons + self.reset() +
+[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: + """Updates the metric + Args: + ---- + gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + """ if preds.shape[0] > 0: # Compute IoU - iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=1).sum()) + if self.use_polygons: + iou_mat = polygon_iou(gts, preds) + else: + iou_mat = box_iou(gts, preds) + self.tot_iou += float(iou_mat.max(axis=0).sum()) + # Assign pairs - gt_indices, _ = assign_pairs(iou_mat, self.iou_thresh) - self.num_matches += len(gt_indices) + gt_indices, pred_indices = linear_sum_assignment(-iou_mat) + self.matches += int((iou_mat[gt_indices, pred_indices] >= self.iou_thresh).sum()) # Update counts self.num_gts += gts.shape[0] - self.num_preds += preds.shape[0] + self.num_preds += preds.shape[0]
- def summary(self) -> Tuple[float, float, float]: +
+[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Computes the aggregated metrics + + Returns + ------- + a tuple with the recall, precision and meanIoU scores + """ # Recall - recall = self.num_matches / self.num_gts + recall = self.matches / self.num_gts if self.num_gts > 0 else None # Precision - precision = self.num_matches / self.num_preds + precision = self.matches / self.num_preds if self.num_preds > 0 else None # mean IoU - mean_iou = self.tot_iou / self.num_preds + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None + + return recall, precision, mean_iou
- return recall, precision, mean_iou def reset(self) -> None: self.num_gts = 0 self.num_preds = 0 - self.num_matches = 0 - self.tot_iou = 0.
+ self.matches = 0 + self.tot_iou = 0.0
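A short sketch of the new `use_polygons` mode of `LocalizationConfusion`, assuming rotated boxes are passed as (N, 4, 2) arrays as `polygon_iou` expects (coordinates below are illustrative only):

>>> import numpy as np
>>> from doctr.utils import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5, use_polygons=True)
>>> gts = np.asarray([[[0, 0], [100, 0], [100, 100], [0, 100]]], dtype=float)
>>> preds = np.asarray([[[0, 0], [70, 0], [70, 70], [0, 70]]], dtype=float)
>>> metric.update(gts, preds)
>>> recall, precision, mean_iou = metric.summary()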
-[docs] +[docs] class OCRMetric: - """Implements end-to-end OCR metric. + r"""Implements an end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, - \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ - Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) + \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, + \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ + Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, - h_{B,L}(b, l) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ - & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, - :math:`\\mathcal{L}` is the set of possible character sequences, + \forall (b, l) \in \mathcal{B} \times \mathcal{L}, + h_{B,L}(b, l) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - ['hello'], ['hello', 'world']) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> ['hello'], ['hello', 'world']) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match - max_dist: maximum Levenshtein distance between 2 sequence to consider a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - max_dist: int = 0 + use_polygons: bool = False, ) -> None: - self.iou_thresh = iou_thresh - self.max_dist = max_dist - self.num_gts = 0 - self.num_preds = 0 - self.num_det_matches = 0 - self.num_reco_matches = 0 - self.tot_iou = 0. - self.tot_dist = 0 + self.use_polygons = use_polygons + self.reset() +
+[docs] def update( self, gt_boxes: np.ndarray, @@ -575,52 +701,207 @@

Source code for doctr.utils.metrics

         gt_labels: List[str],
         pred_labels: List[str],
     ) -> None:
+        """Updates the metric
+
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+            gt_labels: a list of N string labels
+            pred_labels: a list of M string labels
+        """
+        if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
 
         # Compute IoU
-        iou_mat = box_iou(gt_boxes, pred_boxes)
-        if iou_mat.shape[1] == 0:
-            self.tot_iou = 0
-        else:
-            self.tot_iou += float(iou_mat.max(axis=1).sum())
-
-        # Assign pairs
-        gt_indices, preds_indices = assign_pairs(iou_mat, self.iou_thresh)
-
-        # Compare sequences
-        for gt_idx, pred_idx in zip(gt_indices, preds_indices):
-            dist = levenshtein(gt_labels[gt_idx], pred_labels[pred_idx])
-            self.tot_dist += dist
-            if dist <= self.max_dist:
-                self.num_reco_matches += 1
+        if pred_boxes.shape[0] > 0:
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
+            else:
+                iou_mat = box_iou(gt_boxes, pred_boxes)
+
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
+
+            # Assign pairs
+            gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
+            is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
+            # String comparison
+            for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                self.raw_matches += int(_raw)
+                self.caseless_matches += int(_caseless)
+                self.anyascii_matches += int(_anyascii)
+                self.unicase_matches += int(_unicase)
+
+        self.num_gts += gt_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+ + +
+[docs] + def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]: + """Computes the aggregated metrics + + Returns + ------- + a tuple with the recall & precision for each string comparison and the mean IoU + """ + # Recall + recall = dict( + raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None, + caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None, + anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None, + unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None, + ) + + # Precision + precision = dict( + raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None, + caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None, + anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None, + unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None, + ) + + # mean IoU (overall detected boxes) + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None + + return recall, precision, mean_iou
+ + + def reset(self) -> None: + self.num_gts = 0 + self.num_preds = 0 + self.tot_iou = 0.0 + self.raw_matches = 0 + self.caseless_matches = 0 + self.anyascii_matches = 0 + self.unicase_matches = 0
+ + + +
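Unlike the previous version, `OCRMetric.summary()` now returns per-comparison dictionaries; a sketch of how the output might be consumed, reusing the docstring example above:

>>> import numpy as np
>>> from doctr.utils import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), ['hello'], ['hello', 'world'])
>>> recall, precision, mean_iou = metric.summary()
>>> recall['unicase'], precision['raw']  # each dict exposes the raw/caseless/anyascii/unicase scores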
+[docs] +class DetectionMetric: + r"""Implements an object detection metric. + + The aggregated metrics are computed as follows: + + .. math:: + \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, + \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ + Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + + with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and + :math:`y`, and the function :math:`h_{B, C}` defined as: + + .. math:: + \forall (b, c) \in \mathcal{B} \times \mathcal{C}, + h_{B,C}(b, c) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{C}` is the set of possible class indices, + :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. + + >>> import numpy as np + >>> from doctr.utils import DetectionMetric + >>> metric = DetectionMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) + >>> metric.summary() + + Args: + ---- + iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format + """ + + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: + self.iou_thresh = iou_thresh + self.use_polygons = use_polygons + self.reset() + +
+[docs] + def update( + self, + gt_boxes: np.ndarray, + pred_boxes: np.ndarray, + gt_labels: np.ndarray, + pred_labels: np.ndarray, + ) -> None: + """Updates the metric + + Args: + ---- + gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + gt_labels: an array of class indices of shape (N,) + pred_labels: an array of class indices of shape (M,) + """ + if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: + raise AssertionError( + "there should be the same number of boxes and string both for the ground truth and the predictions" + ) + + # Compute IoU + if pred_boxes.shape[0] > 0: + if self.use_polygons: + iou_mat = polygon_iou(gt_boxes, pred_boxes) + else: + iou_mat = box_iou(gt_boxes, pred_boxes) + + self.tot_iou += float(iou_mat.max(axis=0).sum()) + + # Assign pairs + gt_indices, pred_indices = linear_sum_assignment(-iou_mat) + is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh + # Category comparison + self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) - # Update counts - self.num_det_matches = len(gt_indices) self.num_gts += gt_boxes.shape[0] - self.num_preds += pred_boxes.shape[0] + self.num_preds += pred_boxes.shape[0]
+ - def summary(self) -> Tuple[float, float, float, float]: +
+[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Computes the aggregated metrics + Returns + ------- + a tuple with the recall & precision for each class prediction and the mean IoU + """ # Recall - recall = self.num_reco_matches / self.num_gts + recall = self.num_matches / self.num_gts if self.num_gts > 0 else None # Precision - precision = self.num_reco_matches / self.num_preds + precision = self.num_matches / self.num_preds if self.num_preds > 0 else None # mean IoU (overall detected boxes) - mean_iou = self.tot_iou / self.num_preds + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None - # mean distance (overall detection-matching boxes) - mean_distance = self.tot_dist / self.num_det_matches + return recall, precision, mean_iou
- return recall, precision, mean_iou, mean_distance def reset(self) -> None: self.num_gts = 0 self.num_preds = 0 - self.num_det_matches = 0 - self.num_reco_matches = 0 - self.tot_iou = 0. - self.tot_dist = 0
+ self.tot_iou = 0.0 + self.num_matches = 0
@@ -654,8 +935,8 @@

Source code for doctr.utils.metrics

       
     
   
-
- + + diff --git a/v0.2.0/_modules/doctr/utils/visualization.html b/v0.2.0/_modules/doctr/utils/visualization.html index 3e5bc073f8..c818be6d7b 100644 --- a/v0.2.0/_modules/doctr/utils/visualization.html +++ b/v0.2.0/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.utils.visualization

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import colorsys
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import matplotlib.pyplot as plt
-from matplotlib.figure import Figure
+import cv2
 import matplotlib.patches as patches
-import mplcursors
+import matplotlib.pyplot as plt
 import numpy as np
-from typing import Tuple, List, Dict, Any
+from matplotlib.figure import Figure
 
-from .common_types import BoundingBox
+from .common_types import BoundingBox, Polygon4P
 
-__all__ = ['visualize_page']
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
-def create_rect_patch(
+def rect_patch(
     geometry: BoundingBox,
-    label: str,
     page_dimensions: Tuple[int, int],
-    color: Tuple[int, int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
     alpha: float = 0.3,
     linewidth: int = 2,
     fill: bool = True,
-) -> patches.Patch:
-    """Create a matplotlib patch (rectangle) bounding the element
+    preserve_aspect_ratio: bool = False,
+) -> patches.Rectangle:
+    """Create a matplotlib rectangular patch for the element
 
     Args:
+    ----
         geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
         label: label to display when hovered
-        page_dimensions: dimensions of the Page
         color: color to draw box
         alpha: opacity parameter to fill the boxes, 0 = transparent
         linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if the predictor was called with preserve_aspect_ratio=True
 
     Returns:
+    -------
         a rectangular Patch
     """
-    h, w = page_dimensions
+    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
     (xmin, ymin), (xmax, ymax) = geometry
-    xmin, xmax = xmin * w, xmax * w
-    ymin, ymax = ymin * h, ymax * h
-    rect = patches.Rectangle(
+    # Switch to absolute coords
+    if preserve_aspect_ratio:
+        width = height = max(height, width)
+    xmin, w = xmin * width, (xmax - xmin) * width
+    ymin, h = ymin * height, (ymax - ymin) * height
+
+    return patches.Rectangle(
         (xmin, ymin),
-        xmax - xmin,
-        ymax - ymin,
+        w,
+        h,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def polygon_patch(
+    geometry: np.ndarray,
+    page_dimensions: Tuple[int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    alpha: float = 0.3,
+    linewidth: int = 2,
+    fill: bool = True,
+    preserve_aspect_ratio: bool = False,
+) -> patches.Polygon:
+    """Create a matplotlib polygon patch for the element
+
+    Args:
+    ----
+        geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
+        label: label to display when hovered
+        color: color to draw box
+        alpha: opacity parameter to fill the boxes, 0 = transparent
+        linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if the predictor was called with preserve_aspect_ratio=True
+
+    Returns:
+    -------
+        a polygon Patch
+    """
+    if not geometry.shape == (4, 2):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
+    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
+    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
+
+    return patches.Polygon(
+        geometry,
         fill=fill,
         linewidth=linewidth,
         edgecolor=(*color, alpha),
         facecolor=(*color, alpha),
-        label=label
+        label=label,
     )
-    return rect
+
+
+def create_obj_patch(
+    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
+    page_dimensions: Tuple[int, int],
+    **kwargs: Any,
+) -> patches.Patch:
+    """Create a matplotlib patch for the element
+
+    Args:
+    ----
+        geometry: bounding box (straight or rotated) of the element
+        page_dimensions: dimensions of the page in format (height, width)
+        **kwargs: keyword arguments for the patch
+
+    Returns:
+    -------
+        a matplotlib Patch
+    """
+    if isinstance(geometry, tuple):
+        if len(geometry) == 2:  # straight word BB (2 pts)
+            return rect_patch(geometry, page_dimensions, **kwargs)
+        elif len(geometry) == 4:  # rotated word BB (4 pts)
+            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
+    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
+        return polygon_patch(geometry, page_dimensions, **kwargs)
+    raise ValueError("invalid geometry format")
+
+
+def get_colors(num_colors: int) -> List[Tuple[float, float, float]]:
+    """Generate num_colors color for matplotlib
+
+    Args:
+    ----
+        num_colors: number of colors to generate
+
+    Returns:
+    -------
+        colors: list of generated colors
+    """
+    colors = []
+    for i in np.arange(0.0, 360.0, 360.0 / num_colors):
+        hue = i / 360.0
+        lightness = (50 + np.random.rand() * 10) / 100.0
+        saturation = (90 + np.random.rand() * 10) / 100.0
+        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+    return colors
 
 
 
-[docs] +[docs] def visualize_page( page: Dict[str, Any], image: np.ndarray, words_only: bool = True, + display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, @@ -338,22 +472,30 @@

Source code for doctr.utils.visualization

 ) -> Figure:
     """Visualize a full page with predicted blocks, lines and words
 
-    Example::
-        >>> import numpy as np
-        >>> import matplotlib.pyplot as plt
-        >>> from doctr.utils.visualization import visualize_page
-        >>> from doctr.models import ocr_db_crnn
-        >>> model = ocr_db_crnn(pretrained=True)
-        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-        >>> out = model([[input_page]])
-        >>> visualize_page(out[0].pages[0].export(), input_page)
-        >>> plt.show()
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from doctr.utils.visualization import visualize_page
+    >>> from doctr.models import ocr_db_crnn
+    >>> model = ocr_db_crnn(pretrained=True)
+    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+    >>> out = model([[input_page]])
+    >>> visualize_page(out[0].pages[0].export(), input_page)
+    >>> plt.show()
 
     Args:
+    ----
         page: the exported Page of a Document
         image: np array of the page, needs to have the same shape as page['dimensions']
         words_only: whether only words should be displayed
+        display_artefacts: whether artefacts should be displayed
         scale: figsize of the largest windows side
+        interactive: whether the plot should be interactive
+        add_labels: for static plot, adds text labels on top of bounding box
+        **kwargs: keyword arguments for the polygon patch
+
+    Returns:
+    -------
+        the matplotlib figure
     """
     # Get proper scale and aspect ratio
     h, w = image.shape[:2]
@@ -362,58 +504,189 @@ 

Source code for doctr.utils.visualization

     # Display the image
     ax.imshow(image)
     # hide both axis
-    ax.axis('off')
+    ax.axis("off")
 
     if interactive:
         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
 
-    for block in page['blocks']:
+    for block in page["blocks"]:
         if not words_only:
-            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
+            rect = create_obj_patch(
+                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
+            )
             # add patch on figure
             ax.add_patch(rect)
             if interactive:
                 # add patch to cursor's artists
                 artists.append(rect)
 
-        for line in block['lines']:
+        for line in block["lines"]:
             if not words_only:
-                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
+                rect = create_obj_patch(
+                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
-            for word in line['words']:
-                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
-                                         page['dimensions'], (0, 0, 1), **kwargs)
+            for word in line["words"]:
+                rect = create_obj_patch(
+                    word["geometry"],
+                    page["dimensions"],
+                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
+                    color=(0, 0, 1),
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
                 elif add_labels:
-                    ax.text(
-                        int(page['dimensions'][1] * word['geometry'][0][0]),
-                        int(page['dimensions'][0] * word['geometry'][0][1]),
-                        word['value'],
-                        size=10,
-                        alpha=0.5,
-                        color=(0, 0, 1),
-                    )
+                    if len(word["geometry"]) == 5:
+                        text_loc = (
+                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
+                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
+                        )
+                    else:
+                        text_loc = (
+                            int(page["dimensions"][1] * word["geometry"][0][0]),
+                            int(page["dimensions"][0] * word["geometry"][0][1]),
+                        )
 
-        if not words_only:
-            for artefact in block['artefacts']:
-                rect = create_rect_patch(artefact['geometry'], 'artefact', page['dimensions'], (0.5, 0.5, 0.5),
-                                         linewidth=1, **kwargs)
+                    if len(word["geometry"]) == 2:
+                        # We draw only if boxes are in straight format
+                        ax.text(
+                            *text_loc,
+                            word["value"],
+                            size=10,
+                            alpha=0.5,
+                            color=(0, 0, 1),
+                        )
+
+        if display_artefacts:
+            for artefact in block["artefacts"]:
+                rect = create_obj_patch(
+                    artefact["geometry"],
+                    page["dimensions"],
+                    label="artefact",
+                    color=(0.5, 0.5, 0.5),
+                    linewidth=1,
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
-    fig.tight_layout()
+    fig.tight_layout(pad=0.0)
 
     return fig
+ + +def visualize_kie_page( + page: Dict[str, Any], + image: np.ndarray, + words_only: bool = False, + display_artefacts: bool = True, + scale: float = 10, + interactive: bool = True, + add_labels: bool = True, + **kwargs: Any, +) -> Figure: + """Visualize a full page with predicted blocks, lines and words + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from doctr.utils.visualization import visualize_page + >>> from doctr.models import ocr_db_crnn + >>> model = ocr_db_crnn(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([[input_page]]) + >>> visualize_kie_page(out[0].pages[0].export(), input_page) + >>> plt.show() + + Args: + ---- + page: the exported Page of a Document + image: np array of the page, needs to have the same shape than page['dimensions'] + words_only: whether only words should be displayed + display_artefacts: whether artefacts should be displayed + scale: figsize of the largest windows side + interactive: whether the plot should be interactive + add_labels: for static plot, adds text labels on top of bounding box + **kwargs: keyword arguments for the polygon patch + + Returns: + ------- + the matplotlib figure + """ + # Get proper scale and aspect ratio + h, w = image.shape[:2] + size = (scale * w / h, scale) if h > w else (scale, h / w * scale) + fig, ax = plt.subplots(figsize=size) + # Display the image + ax.imshow(image) + # hide both axis + ax.axis("off") + + if interactive: + artists: List[patches.Patch] = [] # instantiate an empty list of patches (to be drawn on the page) + + colors = {k: color for color, k in zip(get_colors(len(page["predictions"])), page["predictions"])} + for key, value in page["predictions"].items(): + for prediction in value: + if not words_only: + rect = create_obj_patch( + prediction["geometry"], + page["dimensions"], + label=f"{key} \n {prediction['value']} (confidence: {prediction['confidence']:.2%}", + color=colors[key], + linewidth=1, + **kwargs, + ) + # add patch on figure + ax.add_patch(rect) + if interactive: + # add patch to cursor's artists + artists.append(rect) + + if interactive: + import mplcursors + + # Create mlp Cursor to hover patches in artists + mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label())) + fig.tight_layout(pad=0.0) + + return fig + + +def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None: + """Draw an array of relative straight boxes on an image + + Args: + ---- + boxes: array of relative boxes, of shape (*, 4) + image: np array, float32 or uint8 + color: color to use for bounding box edges + **kwargs: keyword arguments from `matplotlib.pyplot.plot` + """ + h, w = image.shape[:2] + # Convert boxes to absolute coords + _boxes = deepcopy(boxes) + _boxes[:, [0, 2]] *= w + _boxes[:, [1, 3]] *= h + _boxes = _boxes.astype(np.int32) + for box in _boxes.tolist(): + xmin, ymin, xmax, ymax = box + image = cv2.rectangle( + image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2 + ) + plt.imshow(image) + plt.plot(**kwargs)
@@ -446,8 +719,8 @@

Source code for doctr.utils.visualization

       
     
   
- - + + diff --git a/v0.2.0/_modules/index.html b/v0.2.0/_modules/index.html index dc72311281..5793c44f20 100644 --- a/v0.2.0/_modules/index.html +++ b/v0.2.0/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -225,15 +225,42 @@ - - + + diff --git a/v0.2.0/_sources/datasets.rst.txt b/v0.2.0/_sources/datasets.rst.txt deleted file mode 100644 index d2080bc034..0000000000 --- a/v0.2.0/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.core.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -..autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.2.0/_sources/documents.rst.txt b/v0.2.0/_sources/documents.rst.txt deleted file mode 100644 index e2fa11b344..0000000000 --- a/v0.2.0/_sources/documents.rst.txt +++ /dev/null @@ -1,83 +0,0 @@ -doctr.documents -=============== - - -.. currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. 
autoclass:: Document - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.2.0/_sources/getting_started/installing.rst.txt b/v0.2.0/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.2.0/_sources/getting_started/installing.rst.txt +++ b/v0.2.0/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.2.0/_sources/index.rst.txt b/v0.2.0/_sources/index.rst.txt index a7d5ef909e..53251db142 100644 --- a/v0.2.0/_sources/index.rst.txt +++ b/v0.2.0/_sources/index.rst.txt @@ -1,75 +1,122 @@ -DocTR: Document Text Recognition -================================ +******************************** +docTR: Document Text Recognition +******************************** + +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch + +.. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png + :align: center -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. +* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. -This is the documentation of our repository `doctr `_. 
+Main Features +------------- -Features --------- - -* |:robot:| Robust 2-stages (detection + recognition) OCR predictors fully trained +* |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract -* |:zap:| Predictors optimized to be very fast on both CPU & GPU -* |:bird:| Light package, small dependencies -* |:tools:| Daily maintained -* |:factory:| Easily integrable - +* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract +* |:zap:| Optimized for inference speed on both CPU & GPU +* |:bird:| Light package, minimal dependencies +* |:tools:| Actively maintained by Mindee +* |:factory:| Easy integration (available templates for browser demo & API deployment) -|:scientist:| Build & train your predictor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* |:construction_worker:| Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) -* |:construction_worker:| Fine-tune or train from scratch any detection or recognition model to specialize on your data +.. toctree:: + :maxdepth: 2 + :caption: Getting started + :hidden: + + getting_started/installing + notebooks + + +Model zoo +^^^^^^^^^ + +Text detection models +""""""""""""""""""""" +* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ +* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ +* FAST from `"FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation" `_ + +Text recognition models +""""""""""""""""""""""" +* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ +* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ +* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ +* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ +* PARSeq from `"Scene Text Recognition with Permuted Autoregressive Sequence Models" `_ + + +Supported datasets +^^^^^^^^^^^^^^^^^^ +* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. +* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +* SROIE from `ICDAR 2019 `_. +* IIIT-5k from `CVIT `_. +* Street View Text from `"End-to-End Scene Text Recognition" `_. +* SynthText from `Visual Geometry Group `_. +* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. +* IC03 from `ICDAR 2003 `_. +* IC13 from `ICDAR 2013 `_. +* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. +* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. +* IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. -|:toolbox:| Implemented models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Detection models -"""""""""""""""" - * DB (Differentiable Binarization), `"Real-time Scene Text Detection with Differentiable Binarization" `_. - * LinkNet, `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_. +.. 
toctree:: + :maxdepth: 2 + :caption: Using docTR + :hidden: -Recognition models -"""""""""""""""""" - * SAR (Show, Attend and Read), `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" `_. - * CRNN (Convolutional Recurrent Neural Network), `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_. + using_doctr/using_models + using_doctr/using_datasets + using_doctr/using_contrib_modules + using_doctr/sharing_models + using_doctr/using_model_export + using_doctr/custom_models_training + using_doctr/running_on_aws -|:receipt:| Integrated datasets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. - * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +.. toctree:: + :maxdepth: 2 + :caption: Community + :hidden: + community/resources -Getting Started ---------------- .. toctree:: :maxdepth: 2 + :caption: Package Reference + :hidden: - installing + modules/contrib + modules/datasets + modules/io + modules/models + modules/transforms + modules/utils -Contents --------- - .. toctree:: - :maxdepth: 1 + :maxdepth: 2 + :caption: Contributing + :hidden: - datasets - documents - models - transforms - utils + contributing/code_of_conduct + contributing/contributing -.. automodule:: doctr - :members: +.. toctree:: + :maxdepth: 2 + :caption: Notes + :hidden: + + changelog diff --git a/v0.2.0/_sources/installing.rst.txt b/v0.2.0/_sources/installing.rst.txt deleted file mode 100644 index ee7de4dbc0..0000000000 --- a/v0.2.0/_sources/installing.rst.txt +++ /dev/null @@ -1,26 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or newer. - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.2.0/_sources/models.rst.txt b/v0.2.0/_sources/models.rst.txt deleted file mode 100644 index 410e9604f7..0000000000 --- a/v0.2.0/_sources/models.rst.txt +++ /dev/null @@ -1,209 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 3 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend. -* PostProcessor: making model outputs structured and reusable. 
- - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | | 0.733 | 0.817 | 0.745 | 0.875 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet - - -Post-processing detections -^^^^^^^^^^^^^^^^^^^^^^^^^^ -The purpose of this block is to turn the model output (binary segmentation map for instance), into a set of bounding boxes. - - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - - - 0.860 - - 0.913 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - - - 0.862 - - 0.917 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - - - **0.863** - - **0.921** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). 
- -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 - -Post-processing outputs -^^^^^^^^^^^^^^^^^^^^^^^ -The purpose of this block is to turn the model output (symbol classification for the sequence), into a set of strings. - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+--------------------------------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+=================+==============+============+===============+=========+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+-----------------+--------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | (1024, 1024, 3) | | 0.629 | 0.701 | 0.85 | 0.664 | 0.780 | 1.6 | -+-----------------------------+-----------------+--------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | (1024, 1024, 3) | | 0.630 | 0.702 | 0.49 | 0.666 | 0.783 | 1.0 | -+-----------------------------+-----------------+--------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | (1024, 1024, 3) | | 0.640 | 0.713 | 0.27 | 0.672 | **0.789** | 0.83 | -+-----------------------------+-----------------+--------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | NA | | 0.595 | 0.625 | | 0.753 | 0.700 | | -+-----------------------------+-----------------+--------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. 
text detection | NA | | 0.640 | 0.533 | | 0.689 | 0.611 | | -+-----------------------------+-----------------+--------------+------------+---------------+---------+------------+---------------+---------+ -| aws textract | NA | | **0.781** | **0.830** | | **0.875** | 0.660 | | -+-----------------------------+-----------------+--------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.2.0/_sources/transforms.rst.txt b/v0.2.0/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.2.0/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. 
autoclass:: RandomApply diff --git a/v0.2.0/_sources/utils.rst.txt b/v0.2.0/_sources/utils.rst.txt deleted file mode 100644 index 1a02858378..0000000000 --- a/v0.2.0/_sources/utils.rst.txt +++ /dev/null @@ -1,30 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: ExactMatch - -.. autoclass:: LocalizationConfusion - -.. autoclass:: OCRMetric diff --git a/v0.2.0/_static/basic.css b/v0.2.0/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.2.0/_static/basic.css +++ b/v0.2.0/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.2.0/_static/doctools.js b/v0.2.0/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.2.0/_static/doctools.js +++ b/v0.2.0/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.2.0/_static/documentation_options.js b/v0.2.0/_static/documentation_options.js index 40b838b240..4f656fdbea 100644 --- a/v0.2.0/_static/documentation_options.js +++ b/v0.2.0/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.1.2a0-git', + VERSION: '0.10.1a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.2.0/_static/language_data.js b/v0.2.0/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.2.0/_static/language_data.js +++ b/v0.2.0/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.2.0/_static/searchtools.js b/v0.2.0/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.2.0/_static/searchtools.js +++ b/v0.2.0/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. 
- * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. 
const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.2.0/changelog.html b/v0.2.0/changelog.html index ac81a6f231..fc45a50384 100644 --- a/v0.2.0/changelog.html +++ b/v0.2.0/changelog.html @@ -14,7 +14,7 @@ - + Changelog - docTR documentation @@ -446,7 +446,7 @@

v0.1.0 (2021-03-05) - + diff --git a/v0.2.0/community/resources.html b/v0.2.0/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.2.0/community/resources.html +++ b/v0.2.0/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.2.0/contributing/code_of_conduct.html b/v0.2.0/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.2.0/contributing/code_of_conduct.html +++ b/v0.2.0/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.2.0/contributing/contributing.html b/v0.2.0/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.2.0/contributing/contributing.html +++ b/v0.2.0/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.2.0/datasets.html b/v0.2.0/datasets.html deleted file mode 100644 index 766f224a12..0000000000 --- a/v0.2.0/datasets.html +++ /dev/null @@ -1,564 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework -can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.core.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-

Implements an abstract dataset

-
-
Parameters:
-
    -
  • url – URL of the dataset

  • -
  • file_name – name of the file once downloaded

  • -
  • file_hash – expected SHA256 of the file

  • -
  • extract_archive – whether the downloaded file is an archive to be extracted

  • -
  • download – whether the dataset should be downloaded if not present on disk

  • -
  • overwrite – whether the archive should be re-extracted

  • -
-
-
-
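For instance, fetching and extracting a remote archive boils down to the constructor arguments listed above. A minimal sketch (the URL and file name below are hypothetical, for illustration only):

>>> from doctr.datasets.core import VisionDataset
>>> ds = VisionDataset(
...     url="https://example.com/my_annotations.zip",  # hypothetical archive URL
...     file_name="my_annotations.zip",
...     extract_archive=True,
...     download=True,
... )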
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Tensor], Tensor] | None = None, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Tensor], Tensor] | None = None, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Tensor], Tensor] | None = None, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -

.. autoclass:: OCRDataset

-
-
-

Data Loading

-

Each dataset has its own way to load a sample, but batch aggregation and iteration are handled by a dedicated object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before being passed to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets -of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name

size

characters

digits

10

0123456789

ascii_letters

52

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

punctuation

32

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

currency

5

£€¥¢฿

latin

96

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

french

154

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
- -
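For instance, with the “digits” vocab listed in the table above, a batch of numeric strings can be encoded and padded to a fixed length. A minimal sketch (the target_size value is arbitrary):

>>> from doctr.datasets import encode_sequences
>>> encoded = encode_sequences(sequences=["123", "42"], vocab="0123456789", target_size=5)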
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/documents.html b/v0.2.0/documents.html deleted file mode 100644 index a7450d8048..0000000000 --- a/v0.2.0/documents.html +++ /dev/null @@ -1,736 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

  • -
  • size (the page's)

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, words at the same height but in different columns belong to two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
- -
-
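Putting the hierarchy together, a document can be built bottom-up from its elements. A minimal sketch using the constructors documented above (the text value, confidence and geometries are made up):

>>> from doctr.documents import Word, Line, Block, Page, Document
>>> word = Word("hello", 0.99, ((0.1, 0.1), (0.2, 0.15)))
>>> line = Line([word])
>>> block = Block(lines=[line])
>>> page = Page(blocks=[block], page_idx=0, dimensions=(842, 595))
>>> doc = Document(pages=[page])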
-
-

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF document, returned as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/genindex.html b/v0.2.0/genindex.html index 7be65c62d4..21520455b4 100644 --- a/v0.2.0/genindex.html +++ b/v0.2.0/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -224,15 +224,42 @@

- -
-

Q

-
@@ -478,31 +559,53 @@

Q

R

@@ -512,13 +615,33 @@

R

S

@@ -528,8 +651,36 @@

S

T

+ +
+
+ +
+

U

+ +
@@ -538,11 +689,19 @@

T

V

@@ -552,7 +711,13 @@

V

W

+
@@ -590,8 +755,8 @@

W

- - + + diff --git a/v0.2.0/getting_started/installing.html b/v0.2.0/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.2.0/getting_started/installing.html +++ b/v0.2.0/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.2.0/index.html b/v0.2.0/index.html index 19218e24cf..3a06afc6d9 100644 --- a/v0.2.0/index.html +++ b/v0.2.0/index.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + docTR documentation @@ -226,15 +226,42 @@
-

DocTR: Document Text Recognition

-

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2

+

docTR: Document Text Recognition

+

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

+https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

DocTR provides an easy and powerful way to extract valuable information from your documents:

    -
  • 🧾 for automation: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • +
  • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • 👩‍🔬 for research: quickly compare your own architectures speed & performances with state-of-art models on public datasets.

-

This is the documentation of our repository doctr.

-
-

Features

+
+

Main Features

    -
  • 🤖 Robust 2-stages (detection + recognition) OCR predictors fully trained

  • +
  • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

  • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor (see the sketch after this list)

  • -
  • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

  • -
  • ⚡ Predictors optimized to be very fast on both CPU & GPU

  • -
  • 🐦 Light package, small dependencies

  • -
  • 🛠️ Daily maintained

  • -
  • 🏭 Easily integrable

  • +
  • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

  • +
  • ⚡ Optimized for inference speed on both CPU & GPU

  • +
  • 🐦 Light package, minimal dependencies

  • +
  • 🛠️ Actively maintained by Mindee

  • +
  • 🏭 Easy integration (available templates for browser demo & API deployment)

-
-
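As an illustration of the 3-line claim above, end-to-end usage looks as follows. A minimal sketch (the file path is a placeholder):

>>> from doctr.io import DocumentFile
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)
>>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
>>> result = model(doc)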

🧑‍🔬 Build & train your predictor

+
+
+
+

Model zoo

+
+

Text detection models

    -
  • 👷 Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

  • -
  • 👷 Fine-tune or train from scratch any detection or recognition model to specialize on your data

  • -
-
-
-

🧰 Implemented models

-
-

Detection models

-
-
-
-

Recognition models

-
-
-
-

🧾 Integrated datasets

-
-
-
-
-
-

Getting Started

-
-
-

Contents

-
+
+
+
+
+
+
+
+
@@ -364,7 +381,7 @@

Contents
- + diff --git a/v0.2.0/modules/io.html b/v0.2.0/modules/io.html index 01eadaa4b8..24c41954be 100644 --- a/v0.2.0/modules/io.html +++ b/v0.2.0/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -760,7 +760,7 @@

Returns: - + diff --git a/v0.2.0/modules/models.html b/v0.2.0/modules/models.html index c465cc0586..91b8810a6a 100644 --- a/v0.2.0/modules/models.html +++ b/v0.2.0/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1612,7 +1612,7 @@

Args: - + diff --git a/v0.2.0/modules/transforms.html b/v0.2.0/modules/transforms.html index 30f7a2631a..c5ead3f3ce 100644 --- a/v0.2.0/modules/transforms.html +++ b/v0.2.0/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -835,7 +835,7 @@

Args:< - + diff --git a/v0.2.0/modules/utils.html b/v0.2.0/modules/utils.html index 888a32c321..b7f6fc570b 100644 --- a/v0.2.0/modules/utils.html +++ b/v0.2.0/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -715,7 +715,7 @@

Args: - + diff --git a/v0.2.0/notebooks.html b/v0.2.0/notebooks.html index f97771aebb..d36539f59e 100644 --- a/v0.2.0/notebooks.html +++ b/v0.2.0/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -391,7 +391,7 @@

docTR Notebooks - + diff --git a/v0.2.0/objects.inv b/v0.2.0/objects.inv index 35f61a1448..c1700f291b 100644 Binary files a/v0.2.0/objects.inv and b/v0.2.0/objects.inv differ diff --git a/v0.2.0/py-modindex.html b/v0.2.0/py-modindex.html deleted file mode 100644 index c1569be607..0000000000 --- a/v0.2.0/py-modindex.html +++ /dev/null @@ -1,330 +0,0 @@ - - - - - - - - - - - Python Module Index - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
- -
-

Python Module Index

- -
- - - - - - - - - - - -
 
d
- doctr -
- -
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.2.0/search.html b/v0.2.0/search.html index d4ae2e99e8..d050f5eac7 100644 --- a/v0.2.0/search.html +++ b/v0.2.0/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -226,15 +226,42 @@ - - + + diff --git a/v0.2.0/searchindex.js b/v0.2.0/searchindex.js index a97cd6ba72..6f154115ab 100644 --- a/v0.2.0/searchindex.js +++ b/v0.2.0/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Artefact": [[1, "artefact"]], "Available Datasets": [[0, "available-datasets"]], "Block": [[1, "block"]], "Composing transformations": [[5, "composing-transformations"]], "Contents": [[2, "contents"]], "Data Loading": [[0, "data-loading"]], "Detection models": [[2, "detection-models"], [4, "detection-models"]], "Detection predictors": [[4, "detection-predictors"]], "DocTR Vocabs": [[0, "id1"]], "DocTR: Document Text Recognition": [[2, null]], "Document": [[1, "document"]], "Document structure": [[1, "document-structure"]], "End-to-End OCR": [[4, "end-to-end-ocr"]], "Features": [[2, "features"]], "File reading": [[1, "file-reading"]], "Getting Started": [[2, "getting-started"]], "Installation": [[3, null]], "Line": [[1, "line"]], "Model compression": [[4, "model-compression"]], "Model export": [[4, "model-export"]], "Page": [[1, "page"]], "Post-processing detections": [[4, "post-processing-detections"]], "Post-processing outputs": [[4, "post-processing-outputs"]], "Pre-processing for detection": [[4, "pre-processing-for-detection"]], "Pre-processing for recognition": [[4, "pre-processing-for-recognition"]], "Recognition models": [[2, "recognition-models"], [4, "recognition-models"]], "Recognition predictors": [[4, "recognition-predictors"]], "Supported Vocabs": [[0, "supported-vocabs"]], "Supported transformations": [[5, "supported-transformations"]], "Task evaluation": [[6, "task-evaluation"]], "Text Detection": [[4, "text-detection"]], "Text Recognition": [[4, "text-recognition"]], "Text recognition model zoo": [[4, "id2"]], "Two-stage approaches": [[4, "two-stage-approaches"]], "Using SavedModel": [[4, "using-savedmodel"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[6, "visualization"]], "Word": [[1, "word"]], "doctr.datasets": [[0, null]], "doctr.documents": [[1, null]], "doctr.models": [[4, null]], "doctr.transforms": [[5, null]], "doctr.utils": [[6, null]], "\ud83e\uddd1\u200d\ud83d\udd2c Build & train your predictor": [[2, "scientist-build-train-your-predictor"]], "\ud83e\uddf0 Implemented models": [[2, "toolbox-implemented-models"]], "\ud83e\uddfe Integrated datasets": [[2, "receipt-integrated-datasets"]]}, "docnames": ["datasets", "documents", "index", "installing", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["datasets.rst", "documents.rst", "index.rst", "installing.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.documents)": [[1, "doctr.documents.Artefact", false]], "as_images() (doctr.documents.pdf method)": [[1, "doctr.documents.PDF.as_images", false]], "block (class in doctr.documents)": [[1, "doctr.documents.Block", false]], "colorinversion (class in 
doctr.transforms)": [[5, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[5, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[4, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[4, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[0, "doctr.datasets.CORD", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[4, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[0, "doctr.datasets.loader.DataLoader", false]], "db_resnet50() (in module doctr.models.detection)": [[4, "doctr.models.detection.db_resnet50", false]], "detection_predictor() (in module doctr.models.detection)": [[4, "doctr.models.detection.detection_predictor", false]], "doctr": [[2, "module-doctr", false]], "document (class in doctr.documents)": [[1, "doctr.documents.Document", false]], "documentfile (class in doctr.documents)": [[1, "doctr.documents.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[0, "doctr.datasets.encode_sequences", false]], "exactmatch (class in doctr.utils.metrics)": [[6, "doctr.utils.metrics.ExactMatch", false]], "from_images() (doctr.documents.documentfile class method)": [[1, "doctr.documents.DocumentFile.from_images", false]], "from_pdf() (doctr.documents.documentfile class method)": [[1, "doctr.documents.DocumentFile.from_pdf", false]], "from_url() (doctr.documents.documentfile class method)": [[1, "doctr.documents.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[0, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.documents.pdf method)": [[1, "doctr.documents.PDF.get_artefacts", false]], "get_words() (doctr.documents.pdf method)": [[1, "doctr.documents.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[5, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.documents)": [[1, "doctr.documents.Line", false]], "linknet() (in module doctr.models.detection)": [[4, "doctr.models.detection.linknet", false]], "localizationconfusion (class in doctr.utils.metrics)": [[6, "doctr.utils.metrics.LocalizationConfusion", false]], "module": [[2, "module-doctr", false]], "normalize (class in doctr.transforms)": [[5, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[4, "doctr.models.zoo.ocr_predictor", false]], "ocrmetric (class in doctr.utils.metrics)": [[6, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[5, "doctr.transforms.OneOf", false]], "page (class in doctr.documents)": [[1, "doctr.documents.Page", false]], "pdf (class in doctr.documents)": [[1, "doctr.documents.PDF", false]], "quantize_model() (in module doctr.models.export)": [[4, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[5, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[5, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[5, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[5, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[5, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[5, "doctr.transforms.RandomJpegQuality", false]], "randomsaturation (class in doctr.transforms)": [[5, "doctr.transforms.RandomSaturation", 
false]], "read_html() (in module doctr.documents)": [[1, "doctr.documents.read_html", false]], "read_img() (in module doctr.documents)": [[1, "doctr.documents.read_img", false]], "read_pdf() (in module doctr.documents)": [[1, "doctr.documents.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[4, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[5, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[4, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[4, "doctr.models.recognition.sar_vgg16_bn", false]], "sroie (class in doctr.datasets)": [[0, "doctr.datasets.SROIE", false]], "togray (class in doctr.transforms)": [[5, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.core)": [[0, "doctr.datasets.core.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[6, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.documents)": [[1, "doctr.documents.Word", false]]}, "objects": {"": [[2, 0, 0, "-", "doctr"]], "doctr.datasets": [[0, 1, 1, "", "CORD"], [0, 1, 1, "", "FUNSD"], [0, 1, 1, "", "SROIE"], [0, 2, 1, "", "encode_sequences"]], "doctr.datasets.core": [[0, 1, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[0, 1, 1, "", "DataLoader"]], "doctr.documents": [[1, 1, 1, "", "Artefact"], [1, 1, 1, "", "Block"], [1, 1, 1, "", "Document"], [1, 1, 1, "", "DocumentFile"], [1, 1, 1, "", "Line"], [1, 1, 1, "", "PDF"], [1, 1, 1, "", "Page"], [1, 1, 1, "", "Word"], [1, 2, 1, "", "read_html"], [1, 2, 1, "", "read_img"], [1, 2, 1, "", "read_pdf"]], "doctr.documents.DocumentFile": [[1, 3, 1, "", "from_images"], [1, 3, 1, "", "from_pdf"], [1, 3, 1, "", "from_url"]], "doctr.documents.PDF": [[1, 3, 1, "", "as_images"], [1, 3, 1, "", "get_artefacts"], [1, 3, 1, "", "get_words"]], "doctr.models.detection": [[4, 2, 1, "", "db_resnet50"], [4, 2, 1, "", "detection_predictor"], [4, 2, 1, "", "linknet"]], "doctr.models.export": [[4, 2, 1, "", "convert_to_fp16"], [4, 2, 1, "", "convert_to_tflite"], [4, 2, 1, "", "quantize_model"]], "doctr.models.recognition": [[4, 2, 1, "", "crnn_vgg16_bn"], [4, 2, 1, "", "recognition_predictor"], [4, 2, 1, "", "sar_resnet31"], [4, 2, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[4, 2, 1, "", "ocr_predictor"]], "doctr.transforms": [[5, 1, 1, "", "ColorInversion"], [5, 1, 1, "", "Compose"], [5, 1, 1, "", "LambdaTransformation"], [5, 1, 1, "", "Normalize"], [5, 1, 1, "", "OneOf"], [5, 1, 1, "", "RandomApply"], [5, 1, 1, "", "RandomBrightness"], [5, 1, 1, "", "RandomContrast"], [5, 1, 1, "", "RandomGamma"], [5, 1, 1, "", "RandomHue"], [5, 1, 1, "", "RandomJpegQuality"], [5, 1, 1, "", "RandomSaturation"], [5, 1, 1, "", "Resize"], [5, 1, 1, "", "ToGray"]], "doctr.utils.metrics": [[6, 1, 1, "", "ExactMatch"], [6, 1, 1, "", "LocalizationConfusion"], [6, 1, 1, "", "OCRMetric"]], "doctr.utils.visualization": [[6, 2, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "function", "Python function"], "3": ["py", "method", "Python method"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:function", "3": "py:method"}, "terms": {"": [1, 6], "0": [0, 4, 5, 6], "00": [], "01": [], "0123456789": 0, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 0, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02562": [], 
"03": [], "035": [], "0361328125": [], "04": [], "05": [], "06": [], "06640625": [], "07": [], "08": [], "09": [], "0966796875": [], "1": [0, 4, 5, 6], "10": [0, 6], "100": [4, 5, 6], "1000": 4, "101": [], "1024": 4, "104": [], "106": [], "108": [], "1095": [], "11": [], "110": 6, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 4, "120": [], "123": [], "126": [], "1268": [], "128": 4, "13": [], "130": [], "13068": [], "131": [], "1337891": [], "1357421875": [], "1396484375": [], "14": [], "1420": [], "14470v1": [], "149": [], "15": [], "150": 6, "154": 0, "1552": [], "16": 4, "1630859375": [], "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": [], "185546875": [], "1900": [], "1910": [], "19342": [], "19370": [], "195": [], "19598": [], "199": 4, "1999": [], "2": [2, 4, 5, 6], "20": [], "200": 6, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": [], "2023": [], "207901": [], "21": [], "2103": [], "2186": [], "21888": [], "22": [], "224": [4, 5], "225": 5, "22672": [], "229": 5, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": [], "2504": [], "255": [4, 5, 6], "256": 4, "257": [], "26": [], "26032": [], "264": [], "27": 4, "2700": [], "2710": [], "2749": [], "28": [], "287": [], "29": [], "296": [], "299": [], "2d": [], "3": [1, 2, 3, 4, 5, 6], "30": [], "300": [], "3000": [], "301": [], "30595": 4, "30ghz": [], "31": 4, "32": [0, 4, 5], "3232421875": [], "33": [], "33402": [], "33608": [], "34": [], "340": [], "3456": [], "3515625": [], "36": [], "360": [], "37": [], "38": [], "39": [], "4": [], "40": [], "406": 5, "41": [], "42": [], "43": [], "44": [], "45": [], "456": 5, "46": [], "47": [], "472": [], "48": [], "485": 5, "49": 4, "49377": [], "5": [0, 5, 6], "50": 4, "51": [], "51171875": [], "512": [], "52": 0, "529": [], "53": [], "533": 4, "54": [], "540": [], "5478515625": [], "55": [], "56": [], "57": [], "58": [], "580": [], "5810546875": [], "583": [], "59": [], "595": 4, "597": [], "5k": [], "5m": [], "6": [3, 4, 5], "60": 5, "600": [4, 6], "61": [], "611": 4, "62": [], "625": 4, "626": [], "629": 4, "63": [], "630": 4, "64": [4, 5], "640": 4, "641": [], "647": [], "65": [], "66": [], "660": 4, "664": 4, "666": 4, "67": [], "672": 4, "68": [], "689": 4, "69": [], "693": [], "694": [], "695": [], "6m": [], "7": 4, "70": 6, "700": 4, "701": 4, "702": 4, "707470": [], "71": [], "7100000": [], "713": 4, "7141797": [], "7149": [], "72": [], "72dpi": [], "73": [], "73257": [], "733": 4, "74": [], "745": 4, "75": [], "753": 4, "7581382": [], "76": [], "77": [], "772": [], "772875": [], "78": [], "780": 4, "781": 4, "783": 4, "785": [], "789": 4, "79": [], "793533": [], "796": [], "798": [], "7m": [], "8": [4, 5], "80": [], "800": [4, 6], "81": [], "817": 4, "82": [], "8275l": 4, "83": 4, "830": 4, "84": [], "849": [], "85": 4, "8564453125": [], "857": [], "85875": [], "86": [], "860": 4, "8603515625": [], "862": 4, "863": 4, "87": [], "8707": [], "875": 4, "88": [], "89": [], "9": [], "90": [], "90k": [], "90kdict32px": [], "91": [], "913": 4, "914085328578949": [], "917": 4, "92": [], "921": 4, "93": [], "94": [], "95": 6, "9578408598899841": [], "96": 0, "97": [], "98": [], "99": [], "9949972033500671": [], "A": [0, 1, 2, 4], "And": 4, "As": [], "Be": [], "Being": [], "By": [], "For": 4, "If": [1, 4], "In": 4, "It": 5, "Its": [2, 4], "No": [], "Of": 0, "Or": [], "The": [0, 1, 4, 6], "Then": 4, "To": [], "_": [0, 4], "__call__": [], "_build": [], "_i": 6, "ab": [], "abc": [], 
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 0, "abdef": [], "abl": [], "about": 4, "abov": 4, "abstract": 0, "abstractdataset": [], "abus": [], "accent": 6, "accept": [], "access": [0, 1, 2], "account": [], "accur": [], "accuraci": 6, "achiev": [], "act": [], "action": [], "activ": [], "ad": 5, "adapt": [], "add": 5, "add_hook": [], "add_label": 6, "addit": [], "addition": 4, "address": 1, "adjust": 5, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": [], "aggreg": [0, 6], "aggress": [], "align": 1, "all": [0, 1, 2, 4, 5, 6], "allow": [], "along": [], "alreadi": [], "also": [], "alwai": [], "an": [0, 1, 2, 4, 6], "analysi": [1, 4], "ancient_greek": [], "andrej": [], "angl": 1, "ani": [0, 1, 2, 4, 6], "annot": 1, "anot": [], "anoth": [0, 4], "answer": [], "anyascii": [], "anyon": 2, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [0, 5], "applic": [2, 4], "appoint": [], "appreci": [], "appropri": [], "ar": [0, 1, 4, 5, 6], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 4, "architectur": [2, 4], "archiv": 0, "area": [], "argument": [0, 1], "around": 4, "arrai": 6, "art": 2, "artefact": [], "artefact_typ": 1, "articl": [], "artifici": [], "arxiv": [], "as_imag": 1, "asarrai": 6, "ascii_lett": 0, "aspect": 5, "assess": 6, "assign": 6, "associ": 1, "assum": [], "assume_straight_pag": [], "astyp": [4, 6], "attack": [], "attend": [2, 4], "attent": [], "autoclass": 0, "autom": 2, "automat": [], "autoregress": [], "avail": [4, 5], "averag": [4, 5], "avoid": [], "aw": [2, 4], "awar": [], "azur": [], "b": 6, "b_j": 6, "back": [], "backbon": 4, "backend": 4, "background": [], "bangla": [], "bar": [], "bar_cod": [], "baranovskij": [], "base": [2, 4], "baselin": [2, 4], "batch": [0, 4, 5], "batch_siz": 0, "bblanchon": [], "bbox": [], "becaus": [], "been": [4, 6], "befor": 0, "begin": 6, "behavior": [], "being": [4, 6], "belong": [], "benchmark": [], "best": [], "better": [], "between": [5, 6], "bgr": 1, "bilinear": [4, 5], "bin_thresh": [], "binar": [2, 4], "binari": [1, 4], "bit": [], "block": [4, 6], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [0, 1, 4, 5, 6], "boolean": [], "both": [2, 4, 5], "bottom": [], "bound": [1, 4, 5, 6], "box": [1, 4, 6], "box_thresh": [], "bright": 5, "browser": [], "build": [], "built": [], "byte": [1, 4], "c": [], "c5": 4, "c_j": [], "cach": [], "cache_sampl": [], "call": [], "callabl": [0, 5], "can": [0, 4], "capabl": 4, "case": 6, "cf": 4, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "channel": [1, 4, 5], "channel_prior": [], "channelshuffl": [], "charact": [0, 1, 2, 4, 6], "charactergener": [], "characterist": [], "charg": 4, "charset": [], "chart": 1, "check": [], "checkpoint": [], "chip": [], "christian": [], "ci": [], "clarifi": [], "clariti": [], "class": [0, 1, 5, 6], "class_nam": [], "classif": 4, "classmethod": 1, "clear": [], "clone": 3, "close": [], "co": [], "code": [1, 2], "codecov": [], "colab": [], "collate_fn": [], "collect": 1, "color": 5, "colorinvers": 5, "column": 1, "com": [1, 3], "combin": 4, "command": [], "comment": [], "commit": [], "common": [5, 6], "commun": [], "compar": 2, "comparison": [], "competit": 0, "compil": [], "complaint": [], "complementari": 6, "complet": [], "compon": 4, "compos": [0, 2, 4], "comprehens": [], 
"comput": [4, 6], "conf_threshold": [], "confid": 1, "config": [], "configur": [], "confus": 6, "consecut": [4, 5], "consequ": [], "consid": [1, 6], "consist": [], "consolid": [0, 2], "constant": 5, "construct": [], "contact": [], "contain": [], "content": [0, 1], "context": [], "contib": [], "continu": [], "contrast": 5, "contrast_factor": 5, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 4, "convers": 1, "convert": [1, 4, 5], "convert_page_to_numpi": 1, "convert_to_fp16": 4, "convert_to_tflit": 4, "convolut": 2, "cool": [], "coordin": 1, "cord": [0, 2, 4], "core": [0, 6], "corner": [], "correct": 5, "correspond": 4, "could": [], "counterpart": [], "cover": [], "coverag": [], "cpu": [2, 4], "creat": [], "crnn": [2, 4], "crnn_mobilenet_v3_larg": [], "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 4, "crnn_vgg16_bn": 4, "crop": 4, "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 0, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": [], "daili": 2, "danish": [], "data": [1, 2, 4, 5], "dataload": 0, "dataset": 4, "dataset_info": [], "date": [], "db": 2, "db_crnn_resnet": 4, "db_crnn_vgg": 4, "db_mobilenet_v3_larg": [], "db_resnet34": [], "db_resnet50": 4, "db_sar_resnet": 4, "db_sar_vgg": 4, "dbnet": 4, "deal": [], "decis": [], "decod": 1, "decode_img_as_tensor": [], "dedic": [], "deem": [], "deep": 4, "def": [], "default": [1, 4], "defer": 0, "defin": 6, "deform": 4, "degre": [], "degress": 1, "delet": [], "delimit": [], "delta": 5, "demo": [], "demonstr": [], "depend": 2, "deploi": [], "deploy": [], "derogatori": [], "describ": 4, "descript": [], "design": 5, "desir": [], "det_arch": 4, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": 4, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 4, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": [], "developp": 3, "deviat": 5, "devic": [], "dict": [1, 6], "dictionari": 1, "differ": [], "differenti": [2, 4], "digit": 0, "dimens": [1, 4, 6], "dimension": 5, "direct": [], "directli": 4, "directori": [], "disabl": [], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 4, "discuss": [], "disk": 0, "disparag": [], "displai": 6, "display_artefact": [], "distanc": 6, "distribut": 5, "div": [], "divers": [], "divid": [], "do": [], "doc": [1, 4], "docartefact": [], "docstr": [], "doctr": 3, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [0, 4, 6], "documentbuild": [], "documentfil": 1, "doesn": [], "don": [], "done": [], "download": 0, "downsiz": [], "draw": 5, "drop": 0, "drop_last": 0, "dtype": 4, "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": [], "dynamic_seq_length": [], "e": [1, 3], "each": [0, 1, 2, 4, 5], "eas": [], "easi": [2, 6], "easier": 4, "easili": [1, 2, 4, 6], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [0, 2, 4], "either": 4, "element": [0, 1, 4], "els": [], "email": [], "empathi": [], "en": [], "enabl": 1, "enclos": 1, "encod": [0, 2, 4], "encode_sequ": 0, "encount": [], "encrypt": [], "end": [0, 2, 6], "english": [], "enough": 4, "ensur": [], "entir": 1, "entri": [], "environ": [], "eo": 0, "equiv": [], "error": 6, "estim": [], "etc": 1, "ethnic": [], "evalu": [0, 4], "event": [], "everyon": [], 
"everyth": [], "exact": 6, "exactmatch": 6, "exampl": [0, 1, 4, 5, 6], "exchang": [], "exclud": 4, "execut": [], "exist": [], "expand": [], "expect": [0, 1, 4, 5], "experi": 4, "explan": 4, "explicit": [], "exploit": [2, 4], "export": [1, 6], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 5, "extens": 1, "extern": [], "extract": [0, 2], "extract_arch": 0, "extractor": 4, "f_": 6, "f_a": 6, "factor": 5, "fair": [], "fairli": [], "fals": [0, 4, 5, 6], "faq": [], "fascan": [], "fast": [0, 2], "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [4, 6], "feed": 4, "feedback": [], "feel": [], "felix92": [], "few": [], "figsiz": 6, "figur": 6, "file": 0, "file_hash": 0, "file_nam": 0, "final": [], "find": [], "fine": 2, "finnish": [], "first": [], "firsthand": [], "fit": [], "fitz": 1, "flag": [], "flip": [], "float": [1, 5, 6], "float32": 4, "fn": 5, "focu": [], "focus": [], "folder": 4, "follow": [4, 5, 6], "font": [], "font_famili": [], "foral": 6, "forc": [], "forg": [], "form": [0, 2], "format": [1, 4], "forpost": [0, 2], "forum": [], "found": [], "fp": 4, "fp16": 4, "frac": 6, "frame": 4, "framework": 0, "free": [], "french": [0, 4], "friendli": 2, "from": [0, 1, 2, 4, 5, 6], "from_hub": [], "from_imag": 1, "from_pdf": 1, "from_url": 1, "full": [0, 4, 6], "fulli": 2, "function": [4, 5, 6], "funsd": [0, 2, 4], "further": [], "futur": [], "g": 1, "g_": 6, "g_x": 6, "gallagh": [], "gamma": 5, "gaussian": 5, "gaussianblur": [], "gaussiannois": [], "gen": [], "gender": [], "gener": [], "generic_cyrillic_lett": [], "geometri": 1, "geq": 6, "german": [], "get": 1, "get_artefact": 1, "get_word": 1, "gettextword": 1, "git": 2, "github": 3, "give": [], "given": [0, 4, 6], "global": [], "go": [], "good": [], "googl": [], "googlevis": 2, "gpu": 2, "gracefulli": [], "graph": 1, "grayscal": 5, "ground": 6, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "guid": [], "guidanc": [], "gvision": 4, "h": 1, "h_": 6, "ha": [0, 6], "half": 4, "handl": 0, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 6, "have": [0, 4, 6], "head": [], "healthi": [], "hebrew": [], "height": 1, "hello": 6, "help": [], "here": [0, 5], "hf": [], "hf_hub_download": [], "high": 1, "higher": [], "hindi": [], "hindi_digit": [], "hocr": [], "hook": [], "horizont": 1, "hous": [], "how": [], "howev": [], "hsv": 5, "html": [], "http": [1, 3], "hub": [], "hue": 5, "huggingfac": [], "hw": [], "i": [0, 1, 2, 4, 5, 6], "i7": [], "ibrahimov": [], "ic03": [], "ic13": [], "icdar": [], "icdar2019": 0, "id": [], "ident": [], "identifi": [2, 4], "ignor": 6, "ignore_acc": 6, "ignore_cas": 6, "iiit": [], "iiit5k": [], "iiithw": [], "imag": [0, 1, 2, 4, 5, 6], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [0, 5], "img_cont": [], "img_fold": [], "img_path": [], "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [0, 1, 4, 5, 6], "import": [0, 1, 4, 5, 6], "improv": [], "inappropri": [], "incid": [], "includ": 4, "inclus": [], "increas": 5, "independ": [], "index": 1, "indic": 6, "individu": [], "infer": 5, "inform": [0, 2, 4], "inherit": [0, 4], "input": [1, 4, 5], "input_crop": [], "input_pag": [4, 6], "input_shap": 4, "input_t": 4, "input_tensor": 4, "inspir": 5, "instal": 2, "instanc": 4, "instanti": 4, "instead": 1, "insult": [], "int": [0, 1, 4, 5, 6], "int64": [], "integ": 6, "integr": [], "intel": [], "interact": 6, 
"interfac": [], "interoper": [], "interpol": [4, 5], "interpret": [0, 1], "intersect": 6, "invert": 5, "investig": [], "invis": [], "involv": 4, "io": [], "iou": 6, "iou_thresh": 6, "iou_threshold": [], "irregular": [2, 4], "isn": 0, "issu": [], "italian": [], "iter": 0, "its": [0, 1], "itself": [], "j": 6, "jame": [], "job": [], "join": [], "jpeg": 5, "jpegqual": 5, "jpg": 1, "json": [], "json_output": [], "jump": [], "just": 4, "kei": [], "kera": 4, "kernel": [], "kernel_s": 4, "kernel_shap": [], "keywoard": [], "keyword": [0, 1], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [0, 1, 4, 6], "l": 6, "l_j": 6, "label": [], "label_fil": [], "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 5, "lambdatransform": 5, "lang": [], "languag": [1, 2], "larg": [], "largest": 6, "last": [0, 3, 4], "latenc": [], "later": [], "latest": [], "latin": 0, "layer": [], "layout": [], "lead": [], "leader": [], "learn": 4, "least": [], "left": 6, "legacy_french": [], "length": 0, "less": [], "let": 4, "letter": 6, "level": [4, 6], "levenshtein": 6, "leverag": [], "lf": [], "librari": 3, "light": 2, "lightweight": [], "like": [], "limits_": 6, "line": [2, 6], "line_1_1": [], "link": [], "linknet": [2, 4], "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "list": [0, 1, 5], "ll": 6, "load": [2, 4], "load_state_dict": [], "load_weight": [], "loader": 0, "loc_pr": [], "local": [2, 4, 6], "localis": [], "localizationconfus": 6, "locat": [], "login": [], "login_to_hub": [], "logo": 1, "love": [], "lower": 5, "m": 6, "m1": [], "macbook": [], "machin": [], "made": 2, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 2, "mainten": [], "make": [4, 6], "mani": [], "manipul": [], "map": [0, 4], "map_loc": [], "master": [], "match": [2, 6], "mathcal": 6, "matplotlib": 6, "max": 6, "max_angl": [], "max_area": [], "max_char": [], "max_delta": 5, "max_dist": 6, "max_gain": 5, "max_gamma": 5, "max_qual": 5, "max_ratio": [], "maximum": [0, 6], "maxval": [4, 5], "mbox": 6, "mean": [5, 6], "meaniou": 6, "meant": 1, "measur": 4, "media": [], "median": [], "meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 5, "metric": [4, 6], "middl": [], "might": 4, "min": [], "min_area": [], "min_char": [], "min_gain": 5, "min_gamma": 5, "min_qual": 5, "min_ratio": [], "min_val": 5, "minde": 3, "minim": [], "minimalist": [], "minimum": 6, "minval": 5, "miss": [], "mistak": [], "mix": 2, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": [], "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 3, "model": [0, 6], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [1, 4, 5, 6], "more": [], "moscardi": [], "most": 4, "mozilla": [], "multi": [], "multilingu": [], "multipl": [0, 1, 5], "multipli": 5, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [0, 6], "na": 4, "name": [0, 4], "nation": [], "natur": 2, "ndarrai": [0, 1, 6], "necessari": [], "need": 6, "neg": 5, "nest": [], "nestedobject": 5, "netraj": [], "network": [2, 4], "neural": [2, 4], "new": [], "newer": 3, "next": 0, "nois": [], "noisi": [0, 2], "non": [1, 5, 6], "none": [0, 1], "normal": [4, 5], "norwegian": [], "note": [], "now": [], "np": [4, 6], 
"num_output_channel": [], "num_sampl": [], "number": [0, 5, 6], "numpi": [1, 4, 6], "o": [], "obb": [], "obj_detect": [], "object": 0, "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [0, 2, 6], "ocr_carea": [], "ocr_db_crnn": 6, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 4, "ocrdataset": 0, "ocrmetr": 6, "ocrpredictor": 4, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 5, "onc": [0, 4], "one": [0, 4, 5], "oneof": 5, "ones": [], "onli": [5, 6], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [2, 4], "optim": 2, "option": [], "order": [0, 1, 4], "org": [], "organ": 1, "orient": 1, "orientationpredictor": [], "other": [], "otherwis": 6, "our": [2, 4], "out": [4, 5, 6], "outpout": [], "output": [1, 5], "output_s": [1, 5], "outsid": [], "over": 6, "overal": [], "overlai": [], "overview": [], "overwrit": 0, "overwritten": [], "own": 2, "p": 5, "packag": [2, 6], "pad": [0, 4, 5], "page": [4, 6], "page1": 1, "page2": 1, "page_1": [], "page_idx": 1, "page_orientation_predictor": [], "page_param": [], "pair": 6, "paper": [], "par_1_1": [], "paragraph": [], "paragraph_break": [], "parallel": [], "param": [4, 5], "paramet": [0, 1, 4, 5, 6], "pars": [0, 2], "parseq": [], "part": 5, "parti": [], "partial": [], "particip": [], "pass": [0, 4], "password": [], "patch": [], "path": [1, 4], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "patil": [], "pattern": [], "pdf": 1, "pdfpage": [], "peopl": [], "per": [4, 5], "perform": [1, 2, 4, 5, 6], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 1, "pick": 5, "pictur": 1, "pip": 3, "pipelin": [], "pixel": [1, 5], "platinum": 4, "pleas": [], "plot": [], "plt": 6, "plug": [], "plugin": [], "png": 1, "point": [], "polici": [], "polish": [], "polit": [], "polygon": [], "pool": [], "portugues": [], "posit": 6, "possibl": 6, "post": [], "postprocessor": 4, "potenti": 4, "power": 2, "ppageno": [], "pre": [], "precis": [4, 6], "pred": [], "pred_box": [], "pred_label": [], "predefin": 0, "predict": [1, 6], "predictor": [], "prefer": 0, "preinstal": [], "preprocessor": 4, "prerequisit": [], "present": 0, "preserv": 5, "preserve_aspect_ratio": 5, "pretrain": [2, 4, 6], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": [], "probabl": 5, "problem": [], "procedur": 5, "process": [1, 2], "processor": [], "produc": 4, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 0, "properti": 4, "provid": [2, 4], "public": 2, "publicli": [], "publish": [], "pull": [], "punctuat": 0, "pure": [], "purpos": 4, "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 6, "python": 2, "python3": [], "pytorch": [], "q": [], "qr": 1, "qr_code": [], "qualiti": 5, "quantiz": 4, "quantize_model": 4, "question": [], "quickli": 2, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [4, 6], "random": [4, 5, 6], "randomappli": 5, "randombright": 5, "randomcontrast": 5, "randomcrop": [], "randomgamma": 5, "randomhorizontalflip": [], "randomhu": 5, "randomjpegqu": 5, "randomli": 5, "randomres": [], "randomrot": [], "randomsatur": 5, "randomshadow": [], "rang": 5, "rassi": [], "ratio": 5, "raw": 1, "re": 0, "read": [2, 4], "read_html": 1, "read_img": 1, "read_img_as_numpi": [], "read_img_as_tensor": [], "read_pdf": 1, "readi": [], "real": [2, 4, 5], "realli": [], "reason": [], "rebuild": [], "rebuilt": 
[], "recal": [4, 6], "receipt": [0, 2], "reco_arch": 4, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 6, "recognition_predictor": 4, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 4, "rectangular": [], "recurr": 2, "reduc": 5, "refer": [], "regardless": [], "region": [], "regroup": 6, "regular": [], "reject": [], "rel": 1, "relat": [], "releas": 3, "relev": [], "religion": [], "relu": 4, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": 2, "repres": [1, 4], "represent": [2, 4], "request": [], "requir": [3, 5], "research": 2, "residu": [], "resiz": [4, 5], "resnet": 4, "resnet18": [], "resnet31": [], "resnet34": [], "resnet50": [], "resolv": 1, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [5, 6], "restrict": [], "result": 1, "return": [0, 1, 4], "reusabl": 4, "review": [], "rgb": [1, 5], "rgb_mode": [], "rgb_output": 1, "right": [4, 6], "roboflow": [], "robust": 2, "root": [], "rotat": 1, "run": [], "same": [1, 6], "sampl": 0, "sample_transform": 0, "sanjin": [], "sar": [2, 4], "sar_resnet31": 4, "sar_vgg16_bn": 4, "satur": 5, "save": [0, 4], "saved_model": 4, "scale": 6, "scale_rang": [], "scan": [0, 2], "scene": [2, 4], "scheme": 4, "score": [], "scratch": 2, "script": [], "seamless": 2, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 4, "section": [], "secur": [], "see": [], "seemlessli": 2, "seen": 4, "segment": [2, 4], "self": [], "semant": [2, 4], "send": [], "sens": 6, "sensit": [], "separ": 4, "sequenc": [0, 1, 2, 4, 6], "sequenti": [4, 5], "seri": [], "serial": 4, "serialized_model": 4, "seriou": [], "set": [0, 4, 6], "set_global_polici": [], "sever": [1, 5], "sex": [], "sexual": [], "sha256": 0, "shade": [], "shape": [1, 4, 5, 6], "share": [], "shift": 5, "shm": [], "should": [0, 1, 6], "show": [2, 4, 6], "showcas": [], "shuffl": 0, "side": 6, "signatur": 1, "signific": 0, "simpl": [2, 4], "simpler": [], "sinc": 0, "singl": [], "single_img_doc": [], "size": [0, 1, 4, 5], "skew": [], "slack": [], "slightli": [], "small": 2, "smallest": 1, "snapshot_download": [], "snippet": [], "so": [], "social": [], "socio": [], "some": [], "someth": [], "somewher": [], "sort": [], "sourc": [0, 1, 4, 5, 6], "space": [], "span": [], "spanish": [], "spatial": 1, "special": 2, "specif": [0, 6], "specifi": 1, "speed": [2, 4], "sphinx": [], "sroie": 0, "stabl": 3, "stackoverflow": [], "stage": 2, "standalon": [], "standard": 5, "start": [], "state": 2, "static": [], "statist": 4, "statu": [], "std": 5, "step": [], "still": [], "str": [0, 1, 4, 5, 6], "straight": [], "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 1, "street": [], "strict": [], "strictli": 6, "string": [0, 1, 4], "strive": [], "strong": [2, 4], "structur": 4, "subset": [0, 4], "suggest": [], "sum": 6, "summari": 6, "support": 4, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": 4, "symmetr": [], "symmetric_pad": [], "synthet": [], "synthtext": [], "system": [], "t": 0, "tabl": [], "take": [], "target": [0, 1, 4, 5], "target_s": 0, "task": [0, 2, 4], "task2": [], "team": [], "techminde": [], "templat": 1, "tensor": [0, 4, 5], "tensorflow": [2, 4, 5], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": 1, "text_output": [], "textmatch": [], "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [2, 4], "textstylebrush": [], "textual": [0, 1, 2], "tf": [4, 5], "tf_model": 4, "tflite": 4, "than": 
6, "thank": [], "thei": [], "them": 0, "thi": [2, 3, 4, 6], "thing": [], "third": [], "those": [1, 4], "threaten": [], "threshold": [], "through": [0, 5], "tilman": [], "time": [0, 2, 4, 6], "tini": [], "titl": 1, "tm": [], "tmp": [], "togeth": [1, 4], "tograi": 5, "tool": [], "top": [], "topic": [], "torch": [], "torchvis": 5, "total": [], "toward": [], "train": [0, 4, 5], "train_it": 0, "train_load": 0, "train_pytorch": [], "train_set": 0, "train_tensorflow": [], "trainabl": [2, 4], "tranform": 5, "transcrib": [], "transfer": [], "transfo": 5, "transform": [0, 2], "translat": [], "troll": [], "true": [0, 1, 4, 5, 6], "truth": 6, "tune": 2, "tupl": [1, 4, 5], "turn": 4, "two": 1, "txt": [], "type": [1, 4], "typic": [], "u": [], "ucsd": [], "udac": [], "uint8": [4, 6], "ukrainian": [], "unaccept": [], "underli": 0, "underneath": 1, "understand": [0, 2], "uniform": [4, 5], "uniformli": [], "uninterrupt": 1, "union": 6, "unit": [], "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 4, "updat": 6, "upgrad": [], "upper": 5, "uppercas": [], "url": [0, 1], "us": [0, 3, 6], "usabl": 4, "usag": 4, "use_polygon": [], "useabl": [], "user": [1, 2], "utf": [], "util": [2, 4], "v1": [], "v3": [], "valid": [], "valu": [1, 5], "valuabl": 2, "variabl": [], "varieti": [], "veri": 2, "verifi": 0, "verma": [], "version": 4, "vgg": 4, "vgg16": 4, "vgg16_bn_r": [], "via": 2, "video": [], "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 0, "visiontransform": [], "visual": [], "visualize_pag": 6, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": 4, "vocabulari": [], "w": [1, 6], "w3": [], "wa": [], "wai": [0, 2, 4], "want": [], "warm": 4, "warmup": [], "wasn": [], "we": [1, 2, 4, 5], "weasyprint": [], "web": 1, "websit": [], "welcom": [], "well": [], "were": 1, "what": [], "when": 6, "whenev": [], "where": [1, 6], "whether": [0, 1, 6], "which": 4, "whichev": [], "while": [], "why": [], "width": 1, "wiki": [], "wildreceipt": [], "window": 6, "wish": [], "within": [], "without": 4, "wonder": [], "word": [2, 4, 6], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 6, "work": [], "worker": 0, "workflow": [], "worklow": [], "world": 6, "worth": [], "wrap": [], "wrapper": [0, 5], "write": [], "written": 1, "www": 1, "x": [1, 5, 6], "x12larg": 4, "x_ascend": [], "x_descend": [], "x_i": 6, "x_size": [], "x_wconf": [], "xeon": 4, "xhtml": [], "xmax": 1, "xmin": 1, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 6, "y_i": 6, "y_j": 6, "yet": [], "ymax": 1, "ymin": 1, "yolov8": [], "you": 4, "your": [0, 1, 4, 6], "yoursit": 1, "yugesh": [], "zero": [4, 5], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 0, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], 
"\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], 
"\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["doctr.datasets", "doctr.documents", "DocTR: Document Text Recognition", "Installation", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": [], "01": [], "02": [], "03": [], "04": [], "05": [], "07": [], "08": [], "09": [], "1": [], "10": [], "11": [], "12": [], "18": [], "2": [], "2021": [], "2022": [], "2023": [], "2024": [], "21": [], "22": [], "27": [], "28": [], "29": [], "3": [], "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 4, "architectur": [], "arg": [], "artefact": 1, "artefactdetect": [], "attribut": [], "avail": 0, "aw": [], "ban": [], "block": 1, "bug": [], "build": 2, "changelog": [], "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 5, "compress": 4, "conda": [], "conduct": [], "connect": [], "content": 2, "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 0, "dataload": [], "dataset": [0, 2], "detect": [2, 4], "develop": [], "do": [], "doctr": [0, 1, 2, 4, 5, 6], "document": [1, 2], "end": 4, "enforc": [], "evalu": 6, "export": 4, "factori": [], "featur": 2, "feedback": [], "file": 1, "from": [], "gener": [], "get": 2, "git": 3, "guidelin": [], "half": [], "hub": [], "huggingfac": [], "i": [], "implement": 2, "infer": [], "instal": 3, "integr": 2, "io": [], "lambda": [], "let": [], "line": 1, "linux": [], "load": 0, "loader": [], "main": [], "mode": [], "model": [2, 4], "modifi": [], "modul": [], "name": [], "notebook": [], "object": [], "ocr": 4, "onli": [], "onnx": [], "optim": [], "option": [], "orient": [], "our": [], "output": 4, "own": [], "packag": 3, "page": 1, "perman": [], "pipelin": [], "pledg": [], "post": 4, "pre": 4, "precis": [], "predictor": [2, 4], "prepar": [], "prerequisit": [], "pretrain": [], "process": 4, "push": [], "python": 3, "qualiti": [], "question": [], "read": 1, "readi": [], "recognit": [2, 4], "report": [], "request": [], "resourc": [], "respons": [], "return": [], "right": [], "savedmodel": 4, "scope": [], "share": [], "should": [], "stage": 4, "standard": [], "start": 2, "structur": 1, "style": [], "support": [0, 5], "synthet": [], "task": 6, "temporari": [], "test": [], "text": [2, 4], "train": 2, "transform": 5, "two": 4, "unit": [], "us": 4, "util": 6, "v0": [], "verif": [], "via": 3, "visual": 6, "vocab": 0, "warn": [], "what": [], "word": 1, "your": 2, "zoo": 4}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. 
Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, "codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], "Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, "end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, 
"let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], "Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, 
"doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": 
[[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, 
"doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, 
"doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 
1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", "Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": 
[[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], "51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, 
"8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, "b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 
8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], "db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 
11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], 
"give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], "json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], 
"linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 
10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], 
"recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 
10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, 
"yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, 
"\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, "coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, 
"prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.2.0/transforms.html b/v0.2.0/transforms.html deleted file mode 100644 index a79dd132cb..0000000000 --- a/v0.2.0/transforms.html +++ /dev/null @@ -1,678 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.transforms

-

Data transformations are part of both training and inference procedure. Drawing inspiration from the design of torchvision, we express transformations as composable modules.

-
-

Supported transformations

-

Here are all transformations that are available through DocTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
-
-
-
- -
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor to a Gaussian distribution for each channel

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a user-defined transformation function to a tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.6)[source]
-

Applies the following transformation to a tensor (image or batch of images): -convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta -to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting -each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and -increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.

-
-
-class doctr.transforms.Compose(transforms: List[NestedObject])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[NestedObject])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomGamma, RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, one only will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: NestedObject, p: float = 0.5)[source]
-

Apply with a probability p the input transformation

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
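The composition wrappers above can be combined with the individual transformations documented earlier on this page into a single preprocessing pipeline. The snippet below is a minimal sketch assuming the same TensorFlow backend as the other examples here; the choice of augmentations, the ImageNet-style mean/std values and the probabilities are illustrative only, not recommended settings.
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Normalize, OneOf, RandomApply, RandomBrightness, RandomContrast, Resize
>>> # photometric augmentation first (on the raw [0, 1] image), then resizing, then normalization
>>> augment = RandomApply(OneOf([RandomBrightness(), RandomContrast()]), p=0.5)
>>> transfo = Compose([augment, Resize((32, 32)), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))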
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.0/using_doctr/custom_models_training.html b/v0.2.0/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.2.0/using_doctr/custom_models_training.html +++ b/v0.2.0/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.2.0/using_doctr/running_on_aws.html b/v0.2.0/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.2.0/using_doctr/running_on_aws.html +++ b/v0.2.0/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.2.0/using_doctr/sharing_models.html b/v0.2.0/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.2.0/using_doctr/sharing_models.html +++ b/v0.2.0/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.2.0/using_doctr/using_contrib_modules.html b/v0.2.0/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.2.0/using_doctr/using_contrib_modules.html +++ b/v0.2.0/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.2.0/using_doctr/using_datasets.html b/v0.2.0/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.2.0/using_doctr/using_datasets.html +++ b/v0.2.0/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.2.0/using_doctr/using_model_export.html b/v0.2.0/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.2.0/using_doctr/using_model_export.html +++ b/v0.2.0/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.2.0/using_doctr/using_models.html b/v0.2.0/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.2.0/using_doctr/using_models.html +++ b/v0.2.0/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.2.0/utils.html b/v0.2.0/utils.html deleted file mode 100644 index baa1f29d3b..0000000000 --- a/v0.2.0/utils.html +++ /dev/null @@ -1,534 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.utils

-

This module regroups non-core features that are complementary to the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • scale – figsize of the largest windows side

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model performances.

-
-
-class doctr.utils.metrics.ExactMatch(ignore_case: bool = False, ignore_accents: bool = False)[source]
-

Implements exact match metric (word-level accuracy) for recognition task.

-

The aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, -ExactMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, -f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, -\(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import ExactMatch
->>> metric = ExactMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-
    -
  • ignore_case – if true, ignore letter case when computing metric

  • -
  • ignore_accents – if true, ignore accents errors when computing metrics

  • -
-
-
-
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5)[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ -Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, -g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, max_dist: int = 0)[source]
-

Implements end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, -\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ -Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, -h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{L}\) is the set of possible character sequences, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-
    -
  • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

  • -
  • max_dist – maximum Levenshtein distance between two sequences to consider a match

  • -
-
-
-
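Since the three metrics above share the same update/summary workflow, they can be accumulated side by side over an evaluation set. The sketch below only reuses the call signatures shown in the individual examples; the boxes and transcriptions are synthetic placeholders rather than real model output.
>>> import numpy as np
>>> from doctr.utils import ExactMatch, LocalizationConfusion, OCRMetric
>>> # synthetic ground truths and predictions, only meant to show how the metrics are fed
>>> gt_boxes = np.asarray([[0, 0, 100, 100]])
>>> pred_boxes = np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])
>>> reco_metric = ExactMatch(ignore_case=True)
>>> det_metric = LocalizationConfusion(iou_thresh=0.5)
>>> e2e_metric = OCRMetric(iou_thresh=0.5)
>>> reco_metric.update(['Hello', 'world'], ['hello', 'world'])
>>> det_metric.update(gt_boxes, pred_boxes)
>>> e2e_metric.update(gt_boxes, pred_boxes, ['hello'], ['hello', 'world'])
>>> print(reco_metric.summary(), det_metric.summary(), e2e_metric.summary())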
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/datasets/cord.html b/v0.2.1/_modules/doctr/datasets/cord.html index a750524015..55b0584830 100644 --- a/v0.2.1/_modules/doctr/datasets/cord.html +++ b/v0.2.1/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.cord

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
-import tensorflow as tf
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
-from .core import VisionDataset
+from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['CORD']
+__all__ = ["CORD"]
 
 
 
-[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - Example:: - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 + :align: center + + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', - '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', - '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", + "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", + "cord_train.zip", + ) + + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", + "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", + "cord_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - - # # List images - self.root = os.path.join(self._root, 'image') - self.data: List[Tuple[str, Dict[str, Any]]] = [] + # List images + tmp_root = os.path.join(self.root, "image") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] self.train = train - self.sample_transforms = sample_transforms - for img_path in os.listdir(self.root): - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + np_dtype = np.float32 + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): + # File existence check + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem _targets = [] - with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: - x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] - y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - # Reduce 8 coords to 4 - left, right = min(x), max(x) - top, bot = min(y), max(y) if len(word["text"]) > 0: - _targets.append((word["text"], [left, top, right, bot])) + x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] + y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] + box: Union[List[float], np.ndarray] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], + dtype=np_dtype, + ) + else: + # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax + box = [min(x), min(y), max(x), max(y)] + _targets.append((word["text"], box)) text_targets, box_targets = zip(*_targets) - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.int), labels=text_targets))) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
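For context on the refactor above, a short usage sketch of the constructor flags introduced in this change (use_polygons, recognition_task, detection_task). It only exercises the signature shown in this hunk and is not part of the shipped documentation; at most one of the two task flags can be enabled per instance, otherwise the constructor raises the ValueError added above.
>>> from doctr.datasets import CORD
>>> # full OCR targets (boxes + transcriptions), keeping rotated boxes as polygons
>>> full_set = CORD(train=True, download=True, use_polygons=True)
>>> # recognition-only variant: each sample is a cropped word image with its transcription
>>> reco_set = CORD(train=False, download=True, recognition_task=True)
>>> # detection-only variant: each sample is the page image with its word boxes
>>> det_set = CORD(train=False, download=True, detection_task=True)
>>> img, target = full_set[0]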
@@ -384,8 +461,8 @@

Source code for doctr.datasets.cord

       
     
   
-
- + + diff --git a/v0.2.1/_modules/doctr/datasets/core.html b/v0.2.1/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.2.1/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/datasets/detection.html b/v0.2.1/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.2.1/_modules/doctr/datasets/detection.html +++ b/v0.2.1/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/doc_artefacts.html b/v0.2.1/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.2.1/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.2.1/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.2.1/_modules/doctr/datasets/funsd.html b/v0.2.1/_modules/doctr/datasets/funsd.html index 670e036f7f..f08612f9fa 100644 --- a/v0.2.1/_modules/doctr/datasets/funsd.html +++ b/v0.2.1/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.funsd

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
-import tensorflow as tf
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
-from .core import VisionDataset
+from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['FUNSD']
+__all__ = ["FUNSD"]
 
 
 
-[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - Example:: - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 + :align: center + + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' - SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' - FILE_NAME = 'funsd.zip' + URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" + SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" + FILE_NAME = "funsd.zip" def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + super().__init__( + self.URL, + self.FILE_NAME, + self.SHA256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - self.sample_transforms = sample_transforms + np_dtype = np.float32 # Use the subset - subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') + subfolder = os.path.join("dataset", "training_data" if train else "testing_data") # # List images - self.root = os.path.join(self._root, subfolder, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + tmp_root = os.path.join(self.root, subfolder, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): + # File existence check + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: data = json.load(f) - _targets = [(word['text'], word['box']) for block in data['form'] - for word in block['words'] if len(word['text']) > 0] - + _targets = [ + (word["text"], word["box"]) + for block in data["form"] + for word in block["words"] + if len(word["text"]) > 0 + ] text_targets, box_targets = zip(*_targets) + if use_polygons: + # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + box_targets = [ # type: ignore[assignment] + [ + [box[0], box[1]], + [box[2], box[1]], + [box[2], box[3]], + [box[0], box[3]], + ] + for box in box_targets + ] + + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) + ) + for crop, label in zip(crops, list(text_targets)): + # filter labels with unknown characters + if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), + )) - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.int), labels=text_targets))) + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
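A minimal usage sketch of the task-specific modes introduced above. This is illustrative only: it assumes the archive downloads correctly and that, in recognition mode, each sample is a cropped word image with its transcription, as the loader code suggests.

>>> from doctr.datasets import FUNSD
>>> full_set = FUNSD(train=True, download=True)                        # image + dict(boxes, labels)
>>> img, target = full_set[0]
>>> reco_set = FUNSD(train=True, download=True, recognition_task=True)
>>> crop, label = reco_set[0]                                          # word crop (np.ndarray) and its text (str)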
@@ -378,8 +453,8 @@

Source code for doctr.datasets.funsd

       
     
   
-
- + + diff --git a/v0.2.1/_modules/doctr/datasets/generator/tensorflow.html b/v0.2.1/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.2.1/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.2.1/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/datasets/ic03.html b/v0.2.1/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.2.1/_modules/doctr/datasets/ic03.html +++ b/v0.2.1/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/ic13.html b/v0.2.1/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.2.1/_modules/doctr/datasets/ic13.html +++ b/v0.2.1/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/iiit5k.html b/v0.2.1/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.2.1/_modules/doctr/datasets/iiit5k.html +++ b/v0.2.1/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/iiithws.html b/v0.2.1/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.2.1/_modules/doctr/datasets/iiithws.html +++ b/v0.2.1/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/imgur5k.html b/v0.2.1/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.2.1/_modules/doctr/datasets/imgur5k.html +++ b/v0.2.1/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/loader.html b/v0.2.1/_modules/doctr/datasets/loader.html index 9e0c1c25b4..ed80350ef0 100644 --- a/v0.2.1/_modules/doctr/datasets/loader.html +++ b/v0.2.1/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.loader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-import tensorflow as tf
-import numpy as np
-from typing import Dict, Any, Optional
+from typing import Callable, Optional
 
-from .multithreading import multithread_exec
+import numpy as np
+import tensorflow as tf
 
 __all__ = ["DataLoader"]
 
@@ -293,12 +314,13 @@ 

Source code for doctr.datasets.loader

     """Collate multiple elements into batches
 
     Args:
+    ----
         samples: list of N tuples containing M elements
 
     Returns:
+    -------
         Tuple of M sequences containing N elements each
     """
-
     batch_data = zip(*samples)
 
     tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
@@ -307,23 +329,23 @@ 

Source code for doctr.datasets.loader

 
 
 
-[docs] +[docs] class DataLoader: """Implements a dataset wrapper for fast data loading - Example:: - >>> from doctr.datasets import FUNSD, DataLoader - >>> train_set = CORD(train=True, download=True) - >>> train_loader = DataLoader(train_set, batch_size=32) - >>> train_iter = iter(train_loader) - >>> images, targets = next(train_iter) + >>> from doctr.datasets import CORD, DataLoader + >>> train_set = CORD(train=True, download=True) + >>> train_loader = DataLoader(train_set, batch_size=32) + >>> train_iter = iter(train_loader) + >>> images, targets = next(train_iter) Args: + ---- dataset: the dataset shuffle: whether the samples should be shuffled before passing it to the iterator batch_size: number of elements in each batch drop_last: if `True`, drops the last batch if it isn't full - workers: number of workers to use for data loading + collate_fn: function to merge samples into a batch """ def __init__( @@ -332,17 +354,22 @@

Source code for doctr.datasets.loader

         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        workers: Optional[int] = None,
+        collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
         self.batch_size = batch_size
         nb = len(self.dataset) / batch_size
         self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
-        self.workers = workers
+        if collate_fn is None:
+            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
+        else:
+            self.collate_fn = collate_fn
         self.reset()
 
+    def __len__(self) -> int:
+        return self.num_batches
+
     def reset(self) -> None:
         # Updates indices after each epoch
         self._num_yielded = 0
@@ -358,9 +385,9 @@ 

Source code for doctr.datasets.loader

         if self._num_yielded < self.num_batches:
             # Get next indices
             idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
+            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
 
-            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
+            samples = list(map(self.dataset.__getitem__, indices))
 
             batch_data = self.collate_fn(samples)
 
@@ -401,8 +428,8 @@ 

Source code for doctr.datasets.loader

       
     
   
-
- +
+ diff --git a/v0.2.1/_modules/doctr/datasets/mjsynth.html b/v0.2.1/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.2.1/_modules/doctr/datasets/mjsynth.html +++ b/v0.2.1/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/ocr.html b/v0.2.1/_modules/doctr/datasets/ocr.html index 4ad72c3663..ce1ed8b0d4 100644 --- a/v0.2.1/_modules/doctr/datasets/ocr.html +++ b/v0.2.1/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.ocr

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
-import tensorflow as tf
+from typing import Any, Dict, List, Tuple
 
-from .core import AbstractDataset
+import numpy as np
 
+from .datasets import AbstractDataset
 
-__all__ = ['OCRDataset']
+__all__ = ["OCRDataset"]
 
 
 
-[docs] +[docs] class OCRDataset(AbstractDataset): """Implements an OCR dataset + >>> from doctr.datasets import OCRDataset + >>> train_set = OCRDataset(img_folder="/path/to/images", + >>> label_file="/path/to/labels.json") + >>> img, target = train_set[0] + Args: + ---- img_folder: local path to image folder (all jpg at the root) label_file: local path to the label file - sample_transforms: composable transformations that will be applied to each image - **kwargs: keyword arguments from `VisionDataset`. + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + **kwargs: keyword arguments from `AbstractDataset`. """ def __init__( self, img_folder: str, label_file: str, - sample_transforms: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + use_polygons: bool = False, **kwargs: Any, ) -> None: - - self.sample_transforms = sample_transforms - self.root = img_folder + super().__init__(img_folder, **kwargs) # List images self.data: List[Tuple[str, Dict[str, Any]]] = [] - with open(label_file, 'rb') as f: + np_dtype = np.float32 + with open(label_file, "rb") as f: data = json.load(f) - for file_dic in data: + for img_name, annotations in data.items(): # Get image path - img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg' + img_name = Path(img_name) + # File existence check if not os.path.exists(os.path.join(self.root, img_name)): raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") # handle empty images - if (len(file_dic["coordinates"]) == 0 or - (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")): - self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[]))) + if len(annotations["typed_words"]) == 0: + self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[]))) continue - is_valid: List[bool] = [] - box_targets: List[List[float]] = [] - for box in file_dic["coordinates"]: - xs, ys = zip(*box) - box = [min(xs), min(ys), max(xs), max(ys)] - if box[0] < box[2] and box[1] < box[3]: - box_targets.append(box) - is_valid.append(True) - else: - is_valid.append(False) + # Unpack the straight boxes (xmin, ymin, xmax, ymax) + geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + geoms = [ + [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]] # type: ignore[list-item] + for geom in geoms + ] + + text_targets = [obj["value"] for obj in annotations["typed_words"]] - text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid] - self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
+ self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
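For reference, a label file shaped the way the parsing logic above expects. The keys ("typed_words", "geometry", "value") come from the code; file names and coordinates are purely illustrative.

>>> # labels.json -- one entry per image in img_folder
>>> # {
>>> #     "sample_1.jpg": {
>>> #         "typed_words": [
>>> #             {"geometry": [0.12, 0.30, 0.35, 0.38], "value": "Invoice"},
>>> #             {"geometry": [0.40, 0.30, 0.52, 0.38], "value": "2021"}
>>> #         ]
>>> #     }
>>> # }
>>> from doctr.datasets import OCRDataset
>>> ds = OCRDataset(img_folder="/path/to/images", label_file="/path/to/labels.json")
>>> img, target = ds[0]                        # target: dict with "boxes" (N, 4) and "labels" (list of str)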
@@ -377,8 +402,8 @@

Source code for doctr.datasets.ocr

       
     
   
- - + + diff --git a/v0.2.1/_modules/doctr/datasets/recognition.html b/v0.2.1/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.2.1/_modules/doctr/datasets/recognition.html +++ b/v0.2.1/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/sroie.html b/v0.2.1/_modules/doctr/datasets/sroie.html index 656697b3b9..04cf10bda2 100644 --- a/v0.2.1/_modules/doctr/datasets/sroie.html +++ b/v0.2.1/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.sroie

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import csv
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
-import tensorflow as tf
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
-from .core import VisionDataset
+from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['SROIE']
+__all__ = ["SROIE"]
 
 
 
-[docs] +[docs] class SROIE(VisionDataset): """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction" <https://arxiv.org/pdf/2103.10213.pdf>`_. - Example:: - >>> from doctr.datasets import SROIE - >>> train_set = SROIE(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0 + :align: center + + >>> from doctr.datasets import SROIE + >>> train_set = SROIE(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip', - 'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip', - '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0", + "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f", + "sroie2019_train_task1.zip", + ) + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0", + "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2", + "sroie2019_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[tf.Tensor], tf.Tensor]] = None, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - self.sample_transforms = sample_transforms self.train = train - # # List images - self.root = os.path.join(self._root, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + tmp_root = os.path.join(self.root, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + np_dtype = np.float32 + + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))): + # File existence check + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - _targets = [] - with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f: - for row in csv.reader(f, delimiter=','): - # Safeguard for blank lines - if len(row) > 0: - # Label may contain commas - label = ",".join(row[8:]) - # Reduce 8 coords to 4 - p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8]) - left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x) - top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y) - if len(label) > 0: - _targets.append((label, [left, top, right, bot])) - - text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets))) + with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f: + _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0] + + labels = [",".join(row[8:]) for row in _rows] + # reorder coordinates (8 -> (4,2) -> + # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines + coords: np.ndarray = np.stack( + [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0 + ) + + if not use_polygons: + # xmin, ymin, xmax, ymax + coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1) + + if recognition_task: + crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords) + for crop, label in zip(crops, labels): + if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0: + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, coords)) + else: + self.data.append((img_path, dict(boxes=coords, labels=labels))) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
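A short sketch of what `use_polygons` changes in the target geometry, following the reshaping logic above (the download and the base-class loading behaviour are assumed to work).

>>> from doctr.datasets import SROIE
>>> straight_set = SROIE(train=True, download=True)
>>> _, target = straight_set[0]
>>> target["boxes"].shape                      # (N, 4): xmin, ymin, xmax, ymax
>>> poly_set = SROIE(train=True, download=True, use_polygons=True)
>>> _, target = poly_set[0]
>>> target["boxes"].shape                      # (N, 4, 2): the 4 corner points of each box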
@@ -385,8 +444,8 @@

Source code for doctr.datasets.sroie

       
     
   
-
- + + diff --git a/v0.2.1/_modules/doctr/datasets/svhn.html b/v0.2.1/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.2.1/_modules/doctr/datasets/svhn.html +++ b/v0.2.1/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/svt.html b/v0.2.1/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.2.1/_modules/doctr/datasets/svt.html +++ b/v0.2.1/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/synthtext.html b/v0.2.1/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.2.1/_modules/doctr/datasets/synthtext.html +++ b/v0.2.1/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.2.1/_modules/doctr/datasets/utils.html b/v0.2.1/_modules/doctr/datasets/utils.html index aedf276e89..bde9304597 100644 --- a/v0.2.1/_modules/doctr/datasets/utils.html +++ b/v0.2.1/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.datasets.utils

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import string
 import unicodedata
+from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Sequence as SequenceType
+
 import numpy as np
-from typing import List, Optional, Any
+from PIL import Image
+
+from doctr.io.image import get_img_shape
+from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
 
 from .vocabs import VOCABS
 
-__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
+__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
+
+ImageTensor = TypeVar("ImageTensor")
 
 
 def translate(
     input_string: str,
     vocab_name: str,
-    unknown_char: str = '■',
+    unknown_char: str = "■",
 ) -> str:
     """Translate a string input in a given vocabulary
 
     Args:
+    ----
         input_string: input string to translate
         vocab_name: vocabulary to use (french, latin, ...)
         unknown_char: unknown character for non-translatable characters
 
     Returns:
-        A string translated in a given vocab"""
-
+    -------
+        A string translated in a given vocab
+    """
     if VOCABS.get(vocab_name) is None:
         raise KeyError("output vocabulary must be in vocabs dictionnary")
 
-    translated = ''
+    translated = ""
     for char in input_string:
         if char not in VOCABS[vocab_name]:
             # we need to translate char into a vocab char
@@ -310,85 +350,177 @@ 

Source code for doctr.datasets.utils

                 # remove whitespaces
                 continue
             # normalize character if it is not in vocab
-            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
-            if char == '' or char not in VOCABS[vocab_name]:
+            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
+            if char == "" or char not in VOCABS[vocab_name]:
                # if normalization fails or char is still not in vocab, fall back to the unknown character
                 char = unknown_char
         translated += char
     return translated
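A hedged example of the behaviour described above ("latin" is one of the vocabulary names mentioned in the docstring; the exact output depends on the vocabulary content).

>>> from doctr.datasets.utils import translate
>>> translate("Naïve", "latin")                # 'ï' is not in the vocab, so it is NFD-normalized to 'i' -> 'Naive'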
 
 
-def encode_sequence(
+def encode_string(
     input_string: str,
     vocab: str,
-) -> List[str]:
+) -> List[int]:
     """Given a predefined mapping, encode the string to a sequence of numbers
 
     Args:
+    ----
         input_string: string to encode
         vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A list encoding the input_string"""
-
-    return list(map(vocab.index, input_string))
+    -------
+        A list encoding the input_string
+    """
+    try:
+        return list(map(vocab.index, input_string))
+    except ValueError:
+        raise ValueError(
+            f"some characters cannot be found in 'vocab'. \
+                         Please check the input string {input_string} and the vocabulary {vocab}"
+        )
 
 
 def decode_sequence(
-    input_array: np.array,
+    input_seq: Union[np.ndarray, SequenceType[int]],
     mapping: str,
 ) -> str:
     """Given a predefined mapping, decode the sequence of numbers to a string
 
     Args:
-        input_array: array to decode
+    ----
+        input_seq: array to decode
         mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A string, decoded from input_array"""
-
-    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
+    -------
+        A string, decoded from input_seq
+    """
+    if not isinstance(input_seq, (Sequence, np.ndarray)):
+        raise TypeError("Invalid sequence type")
+    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
         raise AssertionError("Input must be an array of int, with max less than mapping size")
-    decoded = ''.join(mapping[idx] for idx in input_array)
-    return decoded
+
+    return "".join(map(mapping.__getitem__, input_seq))
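A round-trip sketch of the two helpers above (the vocab is illustrative):

>>> from doctr.datasets.utils import encode_string, decode_sequence
>>> vocab = "abcdefghijklmnopqrstuvwxyz"
>>> encode_string("cat", vocab)                # [2, 0, 19]
>>> decode_sequence([2, 0, 19], vocab)         # 'cat'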
 
 
 
-[docs] +[docs] def encode_sequences( sequences: List[str], vocab: str, target_size: Optional[int] = None, eos: int = -1, - **kwargs: Any, + sos: Optional[int] = None, + pad: Optional[int] = None, + dynamic_seq_length: bool = False, ) -> np.ndarray: """Encode character sequences using a given vocab as mapping Args: + ---- sequences: the list of character sequences of size N vocab: the ordered vocab to use for encoding target_size: maximum length of the encoded data eos: encoding of End Of String + sos: optional encoding of Start Of String + pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD + dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size Returns: + ------- the padded encoded data as a tensor """ - if 0 <= eos < len(vocab): raise ValueError("argument 'eos' needs to be outside of vocab possible indices") - if not isinstance(target_size, int): - target_size = max(len(w) for w in sequences) + if not isinstance(target_size, int) or dynamic_seq_length: + # Maximum string length + EOS + max_length = max(len(w) for w in sequences) + 1 + if isinstance(sos, int): + max_length += 1 + if isinstance(pad, int): + max_length += 1 + target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size) # Pad all sequences - encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32) - - for idx, seq in enumerate(sequences): - encoded_seq = encode_sequence(seq, vocab) - encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)] + if isinstance(pad, int): # pad with padding symbol + if 0 <= pad < len(vocab): + raise ValueError("argument 'pad' needs to be outside of vocab possible indices") + # In that case, add EOS at the end of the word before padding + default_symbol = pad + else: # pad with eos symbol + default_symbol = eos + encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32) + + # Encode the strings + for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)): + if isinstance(pad, int): # add eos at the end of the sequence + seq.append(eos) + encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)] + + if isinstance(sos, int): # place sos symbol at the beginning of each sequence + if 0 <= sos < len(vocab): + raise ValueError("argument 'sos' needs to be outside of vocab possible indices") + encoded_data = np.roll(encoded_data, 1) + encoded_data[:, 0] = sos return encoded_data
+ + +def convert_target_to_relative( + img: ImageTensor, target: Union[np.ndarray, Dict[str, Any]] +) -> Tuple[ImageTensor, Union[Dict[str, Any], np.ndarray]]: + if isinstance(target, np.ndarray): + target = convert_to_relative_coords(target, get_img_shape(img)) + else: + target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img)) + return img, target + + +def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]: + """Crop a set of bounding boxes from an image + + Args: + ---- + img_path: path to the image + geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4) + + Returns: + ------- + a list of cropped images + """ + with Image.open(img_path) as pil_img: + img: np.ndarray = np.array(pil_img.convert("RGB")) + # Polygon + if geoms.ndim == 3 and geoms.shape[1:] == (4, 2): + return extract_rcrops(img, geoms.astype(dtype=int)) + if geoms.ndim == 2 and geoms.shape[1] == 4: + return extract_crops(img, geoms.astype(dtype=int)) + raise ValueError("Invalid geometry format") + + +def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]: + """Converts multiclass target to relative coordinates. + + Args: + ---- + img: Image + target: tuple of target polygons and their classes names + + Returns: + ------- + Image and dictionary of boxes, with class names as keys + """ + boxes = convert_to_relative_coords(target[0], get_img_shape(img)) + boxes_classes = target[1] + boxes_dict: Dict = {k: [] for k in sorted(set(boxes_classes))} + for k, poly in zip(boxes_classes, boxes): + boxes_dict[k].append(poly) + boxes_dict = {k: np.stack(v, axis=0) for k, v in boxes_dict.items()} + return img, boxes_dict
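And a sketch of `encode_sequences` with the new `sos`/`pad` handling; per the checks above, the special indices must lie outside the vocab range (the values chosen here are illustrative).

>>> from doctr.datasets.utils import encode_sequences
>>> vocab = "abcdefghijklmnopqrstuvwxyz"
>>> out = encode_sequences(["cat", "go"], vocab, eos=len(vocab), pad=len(vocab) + 1)
>>> out.shape                                  # (2, 5): longest word + 1 EOS slot + 1 PAD slot
>>> out[1]                                     # array([ 6, 14, 26, 27, 27], dtype=int32) -- "go", EOS, then padding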
@@ -421,8 +553,8 @@

Source code for doctr.datasets.utils

       
     
   
- - + + diff --git a/v0.2.1/_modules/doctr/datasets/wildreceipt.html b/v0.2.1/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.2.1/_modules/doctr/datasets/wildreceipt.html +++ b/v0.2.1/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.2.1/_modules/doctr/documents/elements.html b/v0.2.1/_modules/doctr/documents/elements.html deleted file mode 100644 index e4e7bb08a6..0000000000 --- a/v0.2.1/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,571 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional
-
-from doctr.utils.geometry import resolve_enclosing_bbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[BoundingBox] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - geometry = resolve_enclosing_bbox([w.geometry for w in words]) - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[BoundingBox] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - geometry = resolve_enclosing_bbox(line_boxes + artefact_boxes) - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show(self, page: np.ndarray, interactive: bool = True, **kwargs) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/documents/reader.html b/v0.2.1/_modules/doctr/documents/reader.html deleted file mode 100644 index 44de246dc8..0000000000 --- a/v0.2.1/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,611 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/io/elements.html b/v0.2.1/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.2.1/_modules/doctr/io/elements.html +++ b/v0.2.1/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.2.1/_modules/doctr/io/html.html b/v0.2.1/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.2.1/_modules/doctr/io/html.html +++ b/v0.2.1/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.2.1/_modules/doctr/io/image/base.html b/v0.2.1/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.2.1/_modules/doctr/io/image/base.html +++ b/v0.2.1/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.2.1/_modules/doctr/io/image/tensorflow.html b/v0.2.1/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.2.1/_modules/doctr/io/image/tensorflow.html +++ b/v0.2.1/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.2.1/_modules/doctr/io/pdf.html b/v0.2.1/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.2.1/_modules/doctr/io/pdf.html +++ b/v0.2.1/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.2.1/_modules/doctr/io/reader.html b/v0.2.1/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.2.1/_modules/doctr/io/reader.html +++ b/v0.2.1/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.2.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.2.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.2.1/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.2.1/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.2.1/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.2.1/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.2.1/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/classification/zoo.html b/v0.2.1/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.2.1/_modules/doctr/models/classification/zoo.html +++ b/v0.2.1/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

<
- + diff --git a/v0.2.1/_modules/doctr/models/detection/differentiable_binarization.html b/v0.2.1/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index 38e9b36ec2..0000000000 --- a/v0.2.1/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.differentiable_binarization - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to unshrink polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the p_map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: The first parameter.
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly cast to an ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
-
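# --- Editor's illustrative sketch (not part of the diffed source) ---
# The unclip step above offsets the detected polygon outwards by a distance that grows
# with its area and shrinks with its perimeter, scaled by unclip_ratio:
#     distance = area * unclip_ratio / perimeter
# e.g. for a 100 x 20 text box with unclip_ratio = 1.5: (100 * 20) * 1.5 / 240 = 12.5 px
from shapely.geometry import Polygon

box = Polygon([(0, 0), (100, 0), (100, 20), (0, 20)])
print(box.area * 1.5 / box.length)  # 12.5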
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box (in relative coordinates)
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channel to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature map is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
-
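# --- Editor's illustrative sketch (not part of the diffed source) ---
# Quick numeric check of compute_distance above for the point (2, 0.5) and the
# segment a=(0, 0), b=(4, 0): the perpendicular distance should come out as 0.5.
import numpy as np

xs, ys = np.array([[2.0]]), np.array([[0.5]])
a, b = np.array([0.0, 0.0]), np.array([4.0, 0.0])
d1 = np.square(xs - a[0]) + np.square(ys - a[1])      # |PA|^2 = 4.25
d2 = np.square(xs - b[0]) + np.square(ys - b[1])      # |PB|^2 = 4.25
d = np.square(a[0] - b[0]) + np.square(a[1] - b[1])   # |AB|^2 = 16
cosin = (d - d1 - d2) / (2 * np.sqrt(d1 * d2))
print(np.sqrt(d1 * d2 * (1 - np.square(cosin)) / d))  # [[0.5]]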
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon treshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coordinates defining the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
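# --- Editor's illustrative sketch (not part of the diffed source) ---
# The padding distance used above (and the negative shrink in compute_target) follows
# the DB paper: distance = area * (1 - shrink_ratio**2) / perimeter.
# e.g. for a 100 x 20 box with shrink_ratio = 0.4: 2000 * 0.84 / 240 = 7.0 pixels.
from shapely.geometry import Polygon

gt_box = Polygon([(0, 0), (100, 0), (100, 20), (0, 20)])
print(gt_box.area * (1 - 0.4 ** 2) / gt_box.length)  # 7.0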
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
-
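# --- Editor's illustrative sketch (not part of the diffed source) ---
# The "approximate binary map" in the dice-loss branch above is a steep sigmoid of
# (prob_map - thresh_map) with slope k = 50: it behaves almost like hard thresholding
# at the learned threshold while keeping usable gradients.
import numpy as np

k, thresh = 50.0, 0.5
prob = np.array([0.2, 0.49, 0.51, 0.9])
print((1 / (1 + np.exp(-k * (prob - thresh)))).round(3))  # ~[0. 0.378 0.622 1.]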
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.2.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 4325d0b74a..66cef8663d 100644 --- a/v0.2.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -759,7 +759,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.2.1/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.2.1/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/detection/linknet.html b/v0.2.1/_modules/doctr/models/detection/linknet.html deleted file mode 100644 index 129cfdce8b..0000000000 --- a/v0.2.1/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.linknet - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the p_map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from differentiable linknet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box (in relative coordinates)
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num):  # label 0 is the background
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
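# --- Editor's illustrative sketch (not part of the diffed source) ---
# Minimal example of the connected-component labelling used by LinkNetPostProcessor:
# label the binarized map, collect each blob's (x, y) pixel coordinates, then fit a
# bounding rectangle.
import cv2
import numpy as np

bitmap = np.zeros((8, 8), dtype=np.uint8)
bitmap[2:5, 1:6] = 1                                      # one rectangular text blob
n_labels, label_img = cv2.connectedComponents(bitmap, connectivity=4)
points = np.array(np.where(label_img == 1)[::-1]).T.astype(np.int32)
print(n_labels, cv2.boundingRect(points))                 # 2 (1, 2, 5, 3)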
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.2.1/_modules/doctr/models/detection/linknet/tensorflow.html index dbb58e37cf..ce995f99d4 100644 --- a/v0.2.1/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -716,7 +716,7 @@

Source code for doctr.models.detection.linknet.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/detection/zoo.html b/v0.2.1/_modules/doctr/models/detection/zoo.html index b655cdfcea..3651c4e2d3 100644 --- a/v0.2.1/_modules/doctr/models/detection/zoo.html +++ b/v0.2.1/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
-from .core import DetectionPredictor
-from ..preprocessor import PreProcessor
-from .. import detection
+from doctr.file_utils import is_tf_available, is_torch_available
 
+from .. import detection
+from ..detection.fast import reparameterize
+from ..preprocessor import PreProcessor
+from .predictor import DetectionPredictor
 
 __all__ = ["detection_predictor"]
 
-ARCHS = ['db_resnet50', 'linknet']
+ARCHS: List[str]
 
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
+if is_tf_available():
+    ARCHS = [
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
+elif is_torch_available():
+    ARCHS = [
+        "db_resnet34",
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
 
-    # Detection
-    _model = detection.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 1)
+def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
+
+        _model = detection.__dict__[arch](
+            pretrained=pretrained,
+            pretrained_backbone=kwargs.get("pretrained_backbone", True),
+            assume_straight_pages=assume_straight_pages,
+        )
+        # Reparameterize FAST models by default to lower inference latency and memory usage
+        if isinstance(_model, detection.FAST):
+            _model = reparameterize(_model)
+    else:
+        if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+
+        _model = arch
+        _model.assume_straight_pages = assume_straight_pages
+        _model.postprocessor.assume_straight_pages = assume_straight_pages
+
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 2)
     predictor = DetectionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
-        _model
+        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
+        _model,
     )
     return predictor
 
 
 
-[docs] -def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: +[docs] +def detection_predictor( + arch: Any = "fast_base", + pretrained: bool = False, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + batch_size: int = 2, + **kwargs: Any, +) -> DetectionPredictor: """Text detection architecture. - Example:: - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_resnet50') + ---- + arch: name of the architecture or model itself to use (e.g. 'db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset + assume_straight_pages: If True, fit straight boxes to the page + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right + batch_size: number of samples the model processes in parallel + **kwargs: optional keyword arguments passed to the architecture Returns: + ------- Detection predictor """ - - return _predictor(arch, pretrained, **kwargs)
+ return _predictor( + arch=arch, + pretrained=pretrained, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + batch_size=batch_size, + **kwargs, + )
@@ -362,8 +449,8 @@

Source code for doctr.models.detection.zoo

       
     
   
- - + + diff --git a/v0.2.1/_modules/doctr/models/export.html b/v0.2.1/_modules/doctr/models/export.html deleted file mode 100644 index f25a81aa21..0000000000 --- a/v0.2.1/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@ - - - - - - - - - - - - doctr.models.export - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/models/factory/hub.html b/v0.2.1/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.2.1/_modules/doctr/models/factory/hub.html +++ b/v0.2.1/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.2.1/_modules/doctr/models/recognition/crnn.html b/v0.2.1/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.2.1/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.crnn - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs decoding of raw output with CTC and decoding of CTC predictions
-        with label_to_idx mapping dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
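# --- Editor's illustrative sketch (not part of the diffed source) ---
# The decoder above relies on tf.nn.ctc_beam_search_decoder; the greedy variant of the
# same idea is: take the per-timestep argmax, collapse repeats, then drop the blank.
import itertools

vocab = "abc"
blank = len(vocab)                                    # CTC blank is the last index
best_path = [0, 0, blank, 1, 1, blank, blank, 2]      # argmax over logits at each step
collapsed = (k for k, _ in itertools.groupby(best_path))
print("".join(vocab[i] for i in collapsed if i != blank))  # abc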
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            model_output: predicted logits of the model
-            target: list of target strings (encoded internally into gt labels and sequence lengths)
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
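# --- Editor's illustrative sketch (not part of the diffed source) ---
# The reshaping done in CRNN.call above: a (B, H, W, C) feature map becomes a
# (B, W, H * C) sequence, so every horizontal position is one timestep for the BiLSTMs.
import tensorflow as tf

features = tf.random.uniform((1, 4, 32, 64))                      # B, H, W, C
seq = tf.reshape(tf.transpose(features, perm=[0, 2, 1, 3]), (-1, 32, 4 * 64))
print(seq.shape)                                                  # (1, 32, 256)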
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/crnn/tensorflow.html index e50c245923..bc64da9a1b 100644 --- a/v0.2.1/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -658,7 +658,7 @@

Source code for doctr.models.recognition.crnn.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/master/tensorflow.html index 152ebb7e59..aa6aa69325 100644 --- a/v0.2.1/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -655,7 +655,7 @@

Source code for doctr.models.recognition.master.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.2.1/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/recognition/sar.html b/v0.2.1/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.2.1/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.sar - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H, W, C) -> (N, C)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
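# Editorial sketch (NumPy stand-ins for the learned projections above, toy shapes only):
# the attention weights sum to 1 over H*W and the glimpse ends up with shape (N, C).
import numpy as np

N, H, W, C = 2, 4, 8, 16
features = np.random.rand(N, H, W, C)
scores = np.random.rand(N, H, W, 1)                                         # stands in for the projector stack
weights = np.exp(scores) / np.exp(scores).sum(axis=(1, 2), keepdims=True)   # softmax over H*W
glimpse = (features * weights).sum(axis=(1, 2))                             # shape (N, C)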
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embedded_symbol: shape (N, embedding_units)
-            embedded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embedded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, C)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + C) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
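# Editorial sketch (toy values, not part of the original sources): how the mask above
# zeroes the per-timestep loss past each word's <eos> before averaging.
import tensorflow as tf

cce = tf.constant([[0.5, 0.2, 0.9, 0.7]])             # per-timestep CE for one word
seq_len = tf.constant([2]) + 1                         # 2 characters + 1 for <eos>
mask_2d = tf.sequence_mask(seq_len, 4)                 # [[True, True, True, False]]
masked = tf.where(mask_2d, cce, tf.zeros_like(cce))
loss = tf.reduce_sum(masked, axis=1) / tf.cast(seq_len, tf.float32)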
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process logits into text predictions
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
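# Editorial sketch (toy dimensions): the vertical max pooling above collapses the height
# axis so that each column of the feature map becomes one timestep for the LSTM encoder.
import tensorflow as tf

feats = tf.random.uniform((1, 4, 32, 512))   # B x H x W x C from the backbone
pooled = tf.reduce_max(feats, axis=1)        # B x W x C
print(pooled.shape)                          # (1, 32, 512)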
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
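# Editorial sketch (hypothetical 3-character vocab): greedy decoding of attention logits,
# where the index right after the vocab stands for <eos> and everything after it is dropped.
import numpy as np

vocab = "abc"
out_idxs = np.array([0, 2, 3, 1])                                     # argmax over the class axis
chars = [vocab[i] if i < len(vocab) else "<eos>" for i in out_idxs]
word = "".join(chars).split("<eos>")[0]                               # -> "ac"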
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/sar/tensorflow.html index 010bc2bc54..4a591e6451 100644 --- a/v0.2.1/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -757,7 +757,7 @@

Source code for doctr.models.recognition.sar.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.2.1/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.2.1/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/models/recognition/zoo.html b/v0.2.1/_modules/doctr/models/recognition/zoo.html index 5b13575396..f664304019 100644 --- a/v0.2.1/_modules/doctr/models/recognition/zoo.html +++ b/v0.2.1/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
-from .core import RecognitionPredictor
-from ..preprocessor import PreProcessor
-from .. import recognition
+from doctr.file_utils import is_tf_available
+from doctr.models.preprocessor import PreProcessor
 
+from .. import recognition
+from .predictor import RecognitionPredictor
 
 __all__ = ["recognition_predictor"]
 
-ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
+
+ARCHS: List[str] = [
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "sar_resnet31",
+    "master",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
+]
 
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+        _model = recognition.__dict__[arch](
+            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
+        )
+    else:
+        if not isinstance(
+            arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+        ):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
 
-    _model = recognition.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 32)
-    predictor = RecognitionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
-        _model
-    )
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
+    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
+    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
 
     return predictor
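# Editorial sketch: with the branch above, the predictor accepts either an architecture
# name or an already-built recognition model instance.
from doctr.models import recognition_predictor
from doctr.models.recognition import crnn_vgg16_bn

predictor_from_name = recognition_predictor("crnn_vgg16_bn", pretrained=True)
predictor_from_model = recognition_predictor(crnn_vgg16_bn(pretrained=True))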
 
 
 
-[docs] -def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor: +[docs] +def recognition_predictor( + arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + symmetric_pad: bool = False, + batch_size: int = 128, + **kwargs: Any, +) -> RecognitionPredictor: """Text recognition architecture. Example:: @@ -321,14 +369,18 @@

Source code for doctr.models.recognition.zoo

        >>> out = model([input_page])
 
     Args:
-        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
+    ----
+        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
+        **kwargs: optional parameters to be passed to the architecture
 
     Returns:
+    -------
         Recognition predictor
     """
-
-    return _predictor(arch, pretrained, **kwargs)
+ return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
@@ -362,8 +414,8 @@

Source code for doctr.models.recognition.zoo

   
-
- +
+ diff --git a/v0.2.1/_modules/doctr/models/zoo.html b/v0.2.1/_modules/doctr/models/zoo.html index dec6857019..d459671648 100644 --- a/v0.2.1/_modules/doctr/models/zoo.html +++ b/v0.2.1/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from typing import Any
-from .core import OCRPredictor
+
 from .detection.zoo import detection_predictor
+from .kie_predictor import KIEPredictor
+from .predictor import OCRPredictor
 from .recognition.zoo import recognition_predictor
 
+__all__ = ["ocr_predictor", "kie_predictor"]
 
-__all__ = ["ocr_predictor"]
-
-
-def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
 
+def _predictor(
+    det_arch: Any,
+    reco_arch: Any,
+    pretrained: bool,
+    pretrained_backbone: bool = True,
+    assume_straight_pages: bool = True,
+    preserve_aspect_ratio: bool = True,
+    symmetric_pad: bool = True,
+    det_bs: int = 2,
+    reco_bs: int = 128,
+    detect_orientation: bool = False,
+    straighten_pages: bool = False,
+    detect_language: bool = False,
+    **kwargs,
+) -> OCRPredictor:
     # Detection
-    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
+    det_predictor = detection_predictor(
+        det_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=det_bs,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+    )
 
     # Recognition
-    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
+    reco_predictor = recognition_predictor(
+        reco_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=reco_bs,
+    )
 
-    return OCRPredictor(det_predictor, reco_predictor)
+    return OCRPredictor(
+        det_predictor,
+        reco_predictor,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+        detect_orientation=detect_orientation,
+        straighten_pages=straighten_pages,
+        detect_language=detect_language,
+        **kwargs,
+    )
 
 
 
-[docs] +[docs] def ocr_predictor( - det_arch: str = 'db_resnet50', - reco_arch: str = 'crnn_vgg16_bn', + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", pretrained: bool = False, - **kwargs: Any + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - Example:: - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` Returns: + ------- OCR predictor """ + return _predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
+ + - return _predictor(det_arch, reco_arch, pretrained, **kwargs)
+def _kie_predictor( + det_arch: Any, + reco_arch: Any, + pretrained: bool, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + det_bs: int = 2, + reco_bs: int = 128, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs, +) -> KIEPredictor: + # Detection + det_predictor = detection_predictor( + det_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=det_bs, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + ) + + # Recognition + reco_predictor = recognition_predictor( + reco_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=reco_bs, + ) + + return KIEPredictor( + det_predictor, + reco_predictor, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + ) + + +
+[docs] +def kie_predictor( + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, +) -> KIEPredictor: + """End-to-end KIE architecture using one model for localization, and another for text recognition. + + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` + + Returns: + ------- + KIE predictor + """ + return _kie_predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
@@ -353,8 +575,8 @@

Source code for doctr.models.zoo

       
     
   
- - + + diff --git a/v0.2.1/_modules/doctr/transforms/modules.html b/v0.2.1/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.2.1/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - - - - - - - - - - - - doctr.transforms.modules - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/_modules/doctr/transforms/modules/base.html b/v0.2.1/_modules/doctr/transforms/modules/base.html index 96ebd680b7..4596df3848 100644 --- a/v0.2.1/_modules/doctr/transforms/modules/base.html +++ b/v0.2.1/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -643,7 +643,7 @@

Source code for doctr.transforms.modules.base

- + diff --git a/v0.2.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.2.1/_modules/doctr/transforms/modules/tensorflow.html index 0e18bcc922..acbbe96225 100644 --- a/v0.2.1/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.2.1/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -956,7 +956,7 @@

Source code for doctr.transforms.modules.tensorflow

- + diff --git a/v0.2.1/_modules/doctr/utils/metrics.html b/v0.2.1/_modules/doctr/utils/metrics.html index f0ed19117b..8a37d5949a 100644 --- a/v0.2.1/_modules/doctr/utils/metrics.html +++ b/v0.2.1/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.metrics

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-from typing import List, Tuple, Dict
-from unidecode import unidecode
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
+from shapely.geometry import Polygon
 
-__all__ = ['TextMatch', 'box_iou', 'LocalizationConfusion', 'OCRMetric']
+__all__ = [
+    "TextMatch",
+    "box_iou",
+    "polygon_iou",
+    "nms",
+    "LocalizationConfusion",
+    "OCRMetric",
+    "DetectionMetric",
+]
 
 
 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
-    """Perform string comparison with multiple levels of tolerance
+    """Performs string comparison with multiple levels of tolerance
 
     Args:
+    ----
         word1: a string
         word2: another string
 
     Returns:
+    -------
         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-            unidecode counterparts and their lower-case unidecode counterparts match
+            anyascii counterparts and their lower-case anyascii counterparts match
     """
-    raw_match = (word1 == word2)
-    caseless_match = (word1.lower() == word2.lower())
-    unidecode_match = (unidecode(word1) == unidecode(word2))
+    raw_match = word1 == word2
+    caseless_match = word1.lower() == word2.lower()
+    anyascii_match = anyascii(word1) == anyascii(word2)
 
     # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
-    unicase_match = (unidecode(word1).lower() == unidecode(word2).lower())
+    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-    return raw_match, caseless_match, unidecode_match, unicase_match
+    return raw_match, caseless_match, anyascii_match, unicase_match
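# Editorial sketch: the four tolerance levels returned by string_match, on one assumed pair.
from doctr.utils.metrics import string_match

string_match("Étoile", "etoile")   # -> (False, False, False, True): only the lower-case anyascii forms match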
 
 
 
-[docs] +[docs] class TextMatch: - """Implements text match metric (word-level accuracy) for recognition task. + r"""Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \\forall X, Y \\in \\mathcal{W}^N, - TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) + \forall X, Y \in \mathcal{W}^N, + TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \\forall a, x \\in \\mathcal{W}, - f_a(x) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } x = a \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{W}` is the set of all possible character sequences, + \forall a, x \in \mathcal{W}, + f_a(x) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } x = a \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. - Example:: - >>> from doctr.utils import TextMatch - >>> metric = TextMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() """ def __init__(self) -> None: self.reset() +
+[docs] def update( self, gt: List[str], @@ -351,29 +386,32 @@

Source code for doctr.utils.metrics

         """Update the state of the metric with new predictions
 
         Args:
+        ----
             gt: list of ground-truth character sequences
-            pred: list of predicted character sequences"""
-
+            pred: list of predicted character sequences
+        """
         if len(gt) != len(pred):
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.unidecode += int(_unidecode)
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
-        self.total += len(gt)
+        self.total += len(gt)
+
-[docs] +[docs] def summary(self) -> Dict[str, float]: """Computes the aggregated metrics - Returns: - a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode - counterpart and its lower-case unidecode counterpart + Returns + ------- + a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii + counterpart and its lower-case anyascii counterpart """ if self.total == 0: raise AssertionError("you need to update the metric before getting the summary") @@ -381,7 +419,7 @@

Source code for doctr.utils.metrics

         return dict(
             raw=self.raw / self.total,
             caseless=self.caseless / self.total,
-            unidecode=self.unidecode / self.total,
+            anyascii=self.anyascii / self.total,
             unicase=self.unicase / self.total,
         )
@@ -389,24 +427,25 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.raw = 0
         self.caseless = 0
-        self.unidecode = 0
+        self.anyascii = 0
         self.unicase = 0
         self.total = 0
def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Compute the IoU between two sets of bounding boxes + """Computes the IoU between two sets of bounding boxes Args: + ---- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) Returns: + ------- the IoU matrix of shape (N, M) """ - - iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) + iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) @@ -417,62 +456,150 @@

Source code for doctr.utils.metrics

         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
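# Editorial usage sketch for box_iou (assumed toy coordinates):
import numpy as np
from doctr.utils.metrics import box_iou

gts = np.array([[0, 0, 100, 100]], dtype=float)
preds = np.array([[0, 0, 50, 100], [200, 200, 300, 300]], dtype=float)
iou_mat = box_iou(gts, preds)   # -> [[0.5, 0.0]]: half overlap with the first box, none with the second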
 
 
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
+    """Computes the IoU between two sets of rotated bounding boxes
+
+    Args:
+    ----
+        polys_1: rotated bounding boxes of shape (N, 4, 2)
+        polys_2: rotated bounding boxes of shape (M, 4, 2)
+
+    Returns:
+    -------
+        the IoU matrix of shape (N, M)
+    """
+    if polys_1.ndim != 3 or polys_2.ndim != 3:
+        raise AssertionError("expects boxes to be in format (N, 4, 2)")
+
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
+
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
+
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
+
+    return iou_mat
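# Editorial usage sketch for polygon_iou with two 4-point squares (shape (N, 4, 2)):
import numpy as np
from doctr.utils.metrics import polygon_iou

polys_a = np.array([[[0, 0], [2, 0], [2, 2], [0, 2]]], dtype=float)
polys_b = np.array([[[1, 0], [3, 0], [3, 2], [1, 2]]], dtype=float)
iou_mat = polygon_iou(polys_a, polys_b)   # intersection 2, union 6 -> [[0.333...]]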
+
+
+def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
+    """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
+
+    Args:
+    ----
+        boxes: np array of straight boxes: (*, 5), (xmin, ymin, xmax, ymax, score)
+        thresh: iou threshold to perform box suppression.
+
+    Returns:
+    -------
+        A list of box indexes to keep
+    """
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    x2 = boxes[:, 2]
+    y2 = boxes[:, 3]
+    scores = boxes[:, 4]
+
+    areas = (x2 - x1) * (y2 - y1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1)
+        h = np.maximum(0.0, yy2 - yy1)
+        inter = w * h
+        ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+        inds = np.where(ovr <= thresh)[0]
+        order = order[inds + 1]
+    return keep
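# Editorial usage sketch for nms (assumed toy boxes in (xmin, ymin, xmax, ymax, score) format):
import numpy as np
from doctr.utils.metrics import nms

boxes = np.array([
    [0, 0, 10, 10, 0.9],     # best-scoring box, kept first
    [1, 1, 10, 10, 0.8],     # IoU ~0.81 with the first one -> suppressed at thresh=0.5
    [20, 20, 30, 30, 0.7],   # disjoint -> kept
])
keep = nms(boxes, thresh=0.5)   # -> [0, 2]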
+
+
 
-[docs] +[docs] class LocalizationConfusion: - """Implements common confusion metrics and mean IoU for localization evaluation. + r"""Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ - Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) + \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ + Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ + Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ + meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \\forall y \\in \\mathcal{B}, - g_X(y) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, + \forall y \in \mathcal{B}, + g_X(y) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ - def __init__(self, iou_thresh: float = 0.5) -> None: + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: self.iou_thresh = iou_thresh + self.use_polygons = use_polygons self.reset() +
+[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: + """Updates the metric + Args: + ---- + gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + """ if preds.shape[0] > 0: # Compute IoU - iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=1).sum()) + if self.use_polygons: + iou_mat = polygon_iou(gts, preds) + else: + iou_mat = box_iou(gts, preds) + self.tot_iou += float(iou_mat.max(axis=0).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -480,17 +607,18 @@

Source code for doctr.utils.metrics

 
         # Update counts
         self.num_gts += gts.shape[0]
-        self.num_preds += preds.shape[0]
+        self.num_preds += preds.shape[0]
+
-[docs] - def summary(self) -> Tuple[float, float, float]: +[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: """Computes the aggregated metrics - Returns: + Returns + ------- a tuple with the recall, precision and meanIoU scores """ - # Recall recall = self.matches / self.num_gts if self.num_gts > 0 else None @@ -498,7 +626,7 @@

Source code for doctr.utils.metrics

         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -507,57 +635,65 @@

Source code for doctr.utils.metrics

         self.num_gts = 0
         self.num_preds = 0
         self.matches = 0
-        self.tot_iou = 0.
+ self.tot_iou = 0.0
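A minimal sketch of the update/summary/reset cycle for LocalizationConfusion, assuming relative box coordinates and purely illustrative values:

>>> import numpy as np
>>> from doctr.utils import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> # one ground-truth box and one perfectly overlapping prediction, in relative (xmin, ymin, xmax, ymax) format
>>> metric.update(np.array([[0.1, 0.1, 0.4, 0.3]]), np.array([[0.1, 0.1, 0.4, 0.3]]))
>>> recall, precision, mean_iou = metric.summary()  # all three are 1.0 for this toy input
>>> metric.reset()  # clear the counters before evaluating another set of pages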
-[docs] +[docs] class OCRMetric: - """Implements end-to-end OCR metric. + r"""Implements an end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, - \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ - Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) + \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, + \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ + Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, - h_{B,L}(b, l) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ - & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, - :math:`\\mathcal{L}` is the set of possible character sequences, + \forall (b, l) \in \mathcal{B} \times \mathcal{L}, + h_{B,L}(b, l) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - ['hello'], ['hello', 'world']) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> ['hello'], ['hello', 'world']) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ - def __init__(self, iou_thresh: float = 0.5) -> None: + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: self.iou_thresh = iou_thresh + self.use_polygons = use_polygons self.reset() +
+[docs] def update( self, gt_boxes: np.ndarray, @@ -565,44 +701,58 @@

Source code for doctr.utils.metrics

         gt_labels: List[str],
         pred_labels: List[str],
     ) -> None:
+        """Updates the metric
 
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+            gt_labels: a list of N string labels
+            pred_labels: a list of M string labels
+        """
         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
-            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
-                                 "and the predictions")
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
 
         # Compute IoU
         if pred_boxes.shape[0] > 0:
-            iou_mat = box_iou(gt_boxes, pred_boxes)
-            self.tot_iou += float(iou_mat.max(axis=1).sum())
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
+            else:
+                iou_mat = box_iou(gt_boxes, pred_boxes)
+
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
 
             # Assign pairs
             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.unidecode_matches += int(_unidecode)
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
 
         self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+
-[docs] - def summary(self) -> Tuple[Dict[str, float], Dict[str, float], float]: +[docs] + def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]: """Computes the aggregated metrics - Returns: - a tuple with the recall & precision for each string comparison flexibility and the mean IoU + Returns + ------- + a tuple with the recall & precision for each string comparison and the mean IoU """ - # Recall recall = dict( raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None, caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None, - unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None, + anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None, unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None, ) @@ -610,12 +760,12 @@

Source code for doctr.utils.metrics

         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-            unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
             unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -623,12 +773,136 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.num_gts = 0
         self.num_preds = 0
-        self.tot_iou = 0.
+        self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.unidecode_matches = 0
+        self.anyascii_matches = 0
         self.unicase_matches = 0
+ + +
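For completeness, a hedged sketch of how the dictionaries returned by OCRMetric.summary() are keyed (the boxes and labels below are illustrative only):

>>> import numpy as np
>>> from doctr.utils import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> # one ground-truth word and one prediction on the same box, differing only by casing
>>> metric.update(np.array([[0.1, 0.1, 0.4, 0.3]]), np.array([[0.1, 0.1, 0.4, 0.3]]), ["Hello"], ["hello"])
>>> recall, precision, mean_iou = metric.summary()
>>> # recall and precision are dicts keyed by "raw", "caseless", "anyascii" and "unicase";
>>> # with this casing mismatch only the case-insensitive counters are incremented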
+[docs] +class DetectionMetric: + r"""Implements an object detection metric. + + The aggregated metrics are computed as follows: + + .. math:: + \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, + \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ + Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + + with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and + :math:`y`, and the function :math:`h_{B, C}` defined as: + + .. math:: + \forall (b, c) \in \mathcal{B} \times \mathcal{C}, + h_{B,C}(b, c) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{C}` is the set of possible class indices, + :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. + + >>> import numpy as np + >>> from doctr.utils import DetectionMetric + >>> metric = DetectionMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) + >>> metric.summary() + + Args: + ---- + iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format + """ + + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: + self.iou_thresh = iou_thresh + self.use_polygons = use_polygons + self.reset() + +
+[docs] + def update( + self, + gt_boxes: np.ndarray, + pred_boxes: np.ndarray, + gt_labels: np.ndarray, + pred_labels: np.ndarray, + ) -> None: + """Updates the metric + + Args: + ---- + gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + gt_labels: an array of class indices of shape (N,) + pred_labels: an array of class indices of shape (M,) + """ + if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: + raise AssertionError( + "there should be the same number of boxes and string both for the ground truth and the predictions" + ) + + # Compute IoU + if pred_boxes.shape[0] > 0: + if self.use_polygons: + iou_mat = polygon_iou(gt_boxes, pred_boxes) + else: + iou_mat = box_iou(gt_boxes, pred_boxes) + + self.tot_iou += float(iou_mat.max(axis=0).sum()) + + # Assign pairs + gt_indices, pred_indices = linear_sum_assignment(-iou_mat) + is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh + # Category comparison + self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) + + self.num_gts += gt_boxes.shape[0] + self.num_preds += pred_boxes.shape[0]
+ + +
+[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Computes the aggregated metrics + + Returns + ------- + a tuple with the recall & precision for each class prediction and the mean IoU + """ + # Recall + recall = self.num_matches / self.num_gts if self.num_gts > 0 else None + + # Precision + precision = self.num_matches / self.num_preds if self.num_preds > 0 else None + + # mean IoU (overall detected boxes) + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None + + return recall, precision, mean_iou
+ + + def reset(self) -> None: + self.num_gts = 0 + self.num_preds = 0 + self.tot_iou = 0.0 + self.num_matches = 0
+
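Analogously, a small illustrative sketch for the class-aware DetectionMetric above (the class indices and boxes are made up):

>>> import numpy as np
>>> from doctr.utils import DetectionMetric
>>> metric = DetectionMetric(iou_thresh=0.5)
>>> # one ground-truth box of class 0 and one matching prediction carrying the same class index
>>> metric.update(np.array([[0.1, 0.1, 0.4, 0.3]]), np.array([[0.1, 0.1, 0.4, 0.3]]),
>>>               np.array([0], dtype=np.int64), np.array([0], dtype=np.int64))
>>> recall, precision, mean_iou = metric.summary()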
@@ -661,8 +935,8 @@

Source code for doctr.utils.metrics

       
     
   
- - + + diff --git a/v0.2.1/_modules/doctr/utils/visualization.html b/v0.2.1/_modules/doctr/utils/visualization.html index 75fce020ad..c818be6d7b 100644 --- a/v0.2.1/_modules/doctr/utils/visualization.html +++ b/v0.2.1/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.visualization

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import colorsys
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import matplotlib.pyplot as plt
-from matplotlib.figure import Figure
+import cv2
 import matplotlib.patches as patches
-import mplcursors
+import matplotlib.pyplot as plt
 import numpy as np
-from typing import Tuple, List, Dict, Any
+from matplotlib.figure import Figure
 
-from .common_types import BoundingBox
+from .common_types import BoundingBox, Polygon4P
 
-__all__ = ['visualize_page']
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
-def create_rect_patch(
+def rect_patch(
     geometry: BoundingBox,
-    label: str,
     page_dimensions: Tuple[int, int],
-    color: Tuple[int, int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
     alpha: float = 0.3,
     linewidth: int = 2,
     fill: bool = True,
-) -> patches.Patch:
-    """Create a matplotlib patch (rectangle) bounding the element
+    preserve_aspect_ratio: bool = False,
+) -> patches.Rectangle:
+    """Create a matplotlib rectangular patch for the element
 
     Args:
+    ----
         geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
         label: label to display when hovered
-        page_dimensions: dimensions of the Page
         color: color to draw box
         alpha: opacity parameter to fill the boxes, 0 = transparent
         linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if the predictor was called with preserve_aspect_ratio=True
 
     Returns:
+    -------
         a rectangular Patch
     """
-    h, w = page_dimensions
+    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
     (xmin, ymin), (xmax, ymax) = geometry
-    xmin, xmax = xmin * w, xmax * w
-    ymin, ymax = ymin * h, ymax * h
-    rect = patches.Rectangle(
+    # Switch to absolute coords
+    if preserve_aspect_ratio:
+        width = height = max(height, width)
+    xmin, w = xmin * width, (xmax - xmin) * width
+    ymin, h = ymin * height, (ymax - ymin) * height
+
+    return patches.Rectangle(
         (xmin, ymin),
-        xmax - xmin,
-        ymax - ymin,
+        w,
+        h,
         fill=fill,
         linewidth=linewidth,
         edgecolor=(*color, alpha),
         facecolor=(*color, alpha),
-        label=label
+        label=label,
     )
-    return rect
+
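A minimal usage sketch for rect_patch (the coordinates are relative and purely illustrative):

>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots()
>>> # 600 x 800 page (height, width); geometry is ((xmin, ymin), (xmax, ymax)) in relative coordinates
>>> patch = rect_patch(((0.1, 0.1), (0.4, 0.3)), (600, 800), label="word", color=(0, 0, 1))
>>> ax.add_patch(patch)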
+
+def polygon_patch(
+    geometry: np.ndarray,
+    page_dimensions: Tuple[int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    alpha: float = 0.3,
+    linewidth: int = 2,
+    fill: bool = True,
+    preserve_aspect_ratio: bool = False,
+) -> patches.Polygon:
+    """Create a matplotlib polygon patch for the element
+
+    Args:
+    ----
+        geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
+        label: label to display when hovered
+        color: color to draw box
+        alpha: opacity parameter to fill the boxes, 0 = transparent
+        linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if the predictor was called with preserve_aspect_ratio=True
+
+    Returns:
+    -------
+        a polygon Patch
+    """
+    if not geometry.shape == (4, 2):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
+    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
+    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
+
+    return patches.Polygon(
+        geometry,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def create_obj_patch(
+    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
+    page_dimensions: Tuple[int, int],
+    **kwargs: Any,
+) -> patches.Patch:
+    """Create a matplotlib patch for the element
+
+    Args:
+    ----
+        geometry: bounding box (straight or rotated) of the element
+        page_dimensions: dimensions of the page in format (height, width)
+        **kwargs: keyword arguments for the patch
+
+    Returns:
+    -------
+        a matplotlib Patch
+    """
+    if isinstance(geometry, tuple):
+        if len(geometry) == 2:  # straight word BB (2 pts)
+            return rect_patch(geometry, page_dimensions, **kwargs)
+        elif len(geometry) == 4:  # rotated word BB (4 pts)
+            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
+    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
+        return polygon_patch(geometry, page_dimensions, **kwargs)
+    raise ValueError("invalid geometry format")
+
+
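A hedged sketch of how create_obj_patch dispatches between the two geometry formats described above (the values are illustrative):

>>> import numpy as np
>>> # straight box: a 2-tuple of relative (xmin, ymin) / (xmax, ymax) points -> rectangular patch
>>> straight = create_obj_patch(((0.1, 0.1), (0.4, 0.3)), (600, 800), color=(0, 1, 0))
>>> # rotated box: a (4, 2) array of relative corner coordinates -> polygon patch
>>> rotated = create_obj_patch(np.array([[0.1, 0.1], [0.4, 0.12], [0.38, 0.3], [0.08, 0.28]]), (600, 800))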
+def get_colors(num_colors: int) -> List[Tuple[float, float, float]]:
+    """Generate num_colors colors for matplotlib
+
+    Args:
+    ----
+        num_colors: number of colors to generate
+
+    Returns:
+    -------
+        colors: list of generated colors
+    """
+    colors = []
+    for i in np.arange(0.0, 360.0, 360.0 / num_colors):
+        hue = i / 360.0
+        lightness = (50 + np.random.rand() * 10) / 100.0
+        saturation = (90 + np.random.rand() * 10) / 100.0
+        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+    return colors
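For instance, a per-class color map can be built from it, in the spirit of what visualize_kie_page does further below (the class names are made up):

>>> classes = ["date", "total", "company"]
>>> colors = {name: rgb for rgb, name in zip(get_colors(len(classes)), classes)}  # one distinct RGB tuple per class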
 
 
 
-[docs] +[docs] def visualize_page( page: Dict[str, Any], image: np.ndarray, @@ -344,18 +472,18 @@

Source code for doctr.utils.visualization

 ) -> Figure:
     """Visualize a full page with predicted blocks, lines and words
 
-    Example::
-        >>> import numpy as np
-        >>> import matplotlib.pyplot as plt
-        >>> from doctr.utils.visualization import visualize_page
-        >>> from doctr.models import ocr_db_crnn
-        >>> model = ocr_db_crnn(pretrained=True)
-        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-        >>> out = model([[input_page]])
-        >>> visualize_page(out[0].pages[0].export(), input_page)
-        >>> plt.show()
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from doctr.utils.visualization import visualize_page
+    >>> from doctr.models import ocr_db_crnn
+    >>> model = ocr_db_crnn(pretrained=True)
+    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+    >>> out = model([[input_page]])
+    >>> visualize_page(out[0].pages[0].export(), input_page)
+    >>> plt.show()
 
     Args:
+    ----
         page: the exported Page of a Document
         image: np array of the page, needs to have the same shape as page['dimensions']
         words_only: whether only words should be displayed
@@ -363,6 +491,11 @@ 

Source code for doctr.utils.visualization

         scale: figsize of the largest windows side
         interactive: whether the plot should be interactive
         add_labels: for static plot, adds text labels on top of bounding box
+        **kwargs: keyword arguments for the polygon patch
+
+    Returns:
+    -------
+        the matplotlib figure
     """
     # Get proper scale and aspect ratio
     h, w = image.shape[:2]
@@ -371,58 +504,189 @@ 

Source code for doctr.utils.visualization

     # Display the image
     ax.imshow(image)
     # hide both axis
-    ax.axis('off')
+    ax.axis("off")
 
     if interactive:
         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
 
-    for block in page['blocks']:
+    for block in page["blocks"]:
         if not words_only:
-            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
+            rect = create_obj_patch(
+                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
+            )
             # add patch on figure
             ax.add_patch(rect)
             if interactive:
                 # add patch to cursor's artists
                 artists.append(rect)
 
-        for line in block['lines']:
+        for line in block["lines"]:
             if not words_only:
-                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
+                rect = create_obj_patch(
+                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
-            for word in line['words']:
-                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
-                                         page['dimensions'], (0, 0, 1), **kwargs)
+            for word in line["words"]:
+                rect = create_obj_patch(
+                    word["geometry"],
+                    page["dimensions"],
+                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
+                    color=(0, 0, 1),
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
                 elif add_labels:
-                    ax.text(
-                        int(page['dimensions'][1] * word['geometry'][0][0]),
-                        int(page['dimensions'][0] * word['geometry'][0][1]),
-                        word['value'],
-                        size=10,
-                        alpha=0.5,
-                        color=(0, 0, 1),
-                    )
+                    if len(word["geometry"]) == 5:
+                        text_loc = (
+                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
+                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
+                        )
+                    else:
+                        text_loc = (
+                            int(page["dimensions"][1] * word["geometry"][0][0]),
+                            int(page["dimensions"][0] * word["geometry"][0][1]),
+                        )
+
+                    if len(word["geometry"]) == 2:
+                        # We draw only if boxes are in straight format
+                        ax.text(
+                            *text_loc,
+                            word["value"],
+                            size=10,
+                            alpha=0.5,
+                            color=(0, 0, 1),
+                        )
 
         if display_artefacts:
-            for artefact in block['artefacts']:
-                rect = create_rect_patch(artefact['geometry'], 'artefact', page['dimensions'], (0.5, 0.5, 0.5),
-                                         linewidth=1, **kwargs)
+            for artefact in block["artefacts"]:
+                rect = create_obj_patch(
+                    artefact["geometry"],
+                    page["dimensions"],
+                    label="artefact",
+                    color=(0.5, 0.5, 0.5),
+                    linewidth=1,
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
-    fig.tight_layout()
+    fig.tight_layout(pad=0.0)
 
     return fig
+ + +def visualize_kie_page( + page: Dict[str, Any], + image: np.ndarray, + words_only: bool = False, + display_artefacts: bool = True, + scale: float = 10, + interactive: bool = True, + add_labels: bool = True, + **kwargs: Any, +) -> Figure: + """Visualize a full page with predicted blocks, lines and words + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from doctr.utils.visualization import visualize_page + >>> from doctr.models import ocr_db_crnn + >>> model = ocr_db_crnn(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([[input_page]]) + >>> visualize_kie_page(out[0].pages[0].export(), input_page) + >>> plt.show() + + Args: + ---- + page: the exported Page of a Document + image: np array of the page, needs to have the same shape than page['dimensions'] + words_only: whether only words should be displayed + display_artefacts: whether artefacts should be displayed + scale: figsize of the largest windows side + interactive: whether the plot should be interactive + add_labels: for static plot, adds text labels on top of bounding box + **kwargs: keyword arguments for the polygon patch + + Returns: + ------- + the matplotlib figure + """ + # Get proper scale and aspect ratio + h, w = image.shape[:2] + size = (scale * w / h, scale) if h > w else (scale, h / w * scale) + fig, ax = plt.subplots(figsize=size) + # Display the image + ax.imshow(image) + # hide both axis + ax.axis("off") + + if interactive: + artists: List[patches.Patch] = [] # instantiate an empty list of patches (to be drawn on the page) + + colors = {k: color for color, k in zip(get_colors(len(page["predictions"])), page["predictions"])} + for key, value in page["predictions"].items(): + for prediction in value: + if not words_only: + rect = create_obj_patch( + prediction["geometry"], + page["dimensions"], + label=f"{key} \n {prediction['value']} (confidence: {prediction['confidence']:.2%}", + color=colors[key], + linewidth=1, + **kwargs, + ) + # add patch on figure + ax.add_patch(rect) + if interactive: + # add patch to cursor's artists + artists.append(rect) + + if interactive: + import mplcursors + + # Create mlp Cursor to hover patches in artists + mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label())) + fig.tight_layout(pad=0.0) + + return fig + + +def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None: + """Draw an array of relative straight boxes on an image + + Args: + ---- + boxes: array of relative boxes, of shape (*, 4) + image: np array, float32 or uint8 + color: color to use for bounding box edges + **kwargs: keyword arguments from `matplotlib.pyplot.plot` + """ + h, w = image.shape[:2] + # Convert boxes to absolute coords + _boxes = deepcopy(boxes) + _boxes[:, [0, 2]] *= w + _boxes[:, [1, 3]] *= h + _boxes = _boxes.astype(np.int32) + for box in _boxes.tolist(): + xmin, ymin, xmax, ymax = box + image = cv2.rectangle( + image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2 + ) + plt.imshow(image) + plt.plot(**kwargs)
@@ -455,8 +719,8 @@

Source code for doctr.utils.visualization

       
     
   
- - + + diff --git a/v0.2.1/_modules/index.html b/v0.2.1/_modules/index.html index b3ffa8c863..5793c44f20 100644 --- a/v0.2.1/_modules/index.html +++ b/v0.2.1/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -225,20 +225,42 @@ - - + + diff --git a/v0.2.1/_sources/changelog.rst.txt b/v0.2.1/_sources/changelog.rst.txt index 0370519549..35befe7b96 100644 --- a/v0.2.1/_sources/changelog.rst.txt +++ b/v0.2.1/_sources/changelog.rst.txt @@ -1,6 +1,58 @@ Changelog ========= +v0.10.0 (2024-10-21) +------------------- +Release note: `v0.10.0 `_ + +v0.9.0 (2024-08-08) +------------------- +Release note: `v0.9.0 `_ + +v0.8.1 (2024-03-04) +------------------- +Release note: `v0.8.1 `_ + +v0.8.0 (2024-02-28) +------------------- +Release note: `v0.8.0 `_ + +v0.7.0 (2023-09-09) +------------------- +Release note: `v0.7.0 `_ + +v0.6.0 (2022-09-29) +------------------- +Release note: `v0.6.0 `_ + +v0.5.1 (2022-03-22) +------------------- +Release note: `v0.5.1 `_ + +v0.5.0 (2021-12-31) +------------------- +Release note: `v0.5.0 `_ + +v0.4.1 (2021-11-22) +------------------- +Release note: `v0.4.1 `_ + +v0.4.0 (2021-10-01) +------------------- +Release note: `v0.4.0 `_ + +v0.3.1 (2021-08-27) +------------------- +Release note: `v0.3.1 `_ + +v0.3.0 (2021-07-02) +------------------- +Release note: `v0.3.0 `_ + +v0.2.1 (2021-05-28) +------------------- +Release note: `v0.2.1 `_ + v0.2.0 (2021-05-11) ------------------- Release note: `v0.2.0 `_ diff --git a/v0.2.1/_sources/datasets.rst.txt b/v0.2.1/_sources/datasets.rst.txt deleted file mode 100644 index 31a5663285..0000000000 --- a/v0.2.1/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.core.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. 
autofunction:: encode_sequences diff --git a/v0.2.1/_sources/documents.rst.txt b/v0.2.1/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.2.1/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.2.1/_sources/getting_started/installing.rst.txt b/v0.2.1/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.2.1/_sources/getting_started/installing.rst.txt +++ b/v0.2.1/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.2.1/_sources/index.rst.txt b/v0.2.1/_sources/index.rst.txt index afe926c6df..53251db142 100644 --- a/v0.2.1/_sources/index.rst.txt +++ b/v0.2.1/_sources/index.rst.txt @@ -1,7 +1,8 @@ -DocTR: Document Text Recognition -================================ +******************************** +docTR: Document Text Recognition +******************************** -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -9,38 +10,29 @@ State-of-the-art Optical Character Recognition made seamless & accessible to any DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. 
+* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. -Welcome to the documentation of `DocTR `_! - - Main Features ------------- * |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract +* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, small dependencies -* |:tools:| Daily maintained -* |:factory:| Easy integration - +* |:bird:| Light package, minimal dependencies +* |:tools:| Actively maintained by Mindee +* |:factory:| Easy integration (available templates for browser demo & API deployment) -Getting Started ---------------- .. toctree:: :maxdepth: 2 + :caption: Getting started + :hidden: - installing - - -Build & train your predictor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) -* Fine-tune or train from scratch any detection or recognition model to specialize on your data + getting_started/installing + notebooks Model zoo @@ -48,35 +40,83 @@ Model zoo Text detection models """"""""""""""""""""" - * `DBNet `_ (Differentiable Binarization) - * `LinkNet `_ +* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ +* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ +* FAST from `"FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation" `_ Text recognition models """"""""""""""""""""""" - * `SAR `_ (Show, Attend and Read) - * `CRNN `_ (Convolutional Recurrent Neural Network) +* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ +* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ +* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ +* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ +* PARSeq from `"Scene Text Recognition with Permuted Autoregressive Sequence Models" `_ Supported datasets ^^^^^^^^^^^^^^^^^^ - * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. - * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. - * SROIE from `ICDAR 2019 `_. +* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. +* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +* SROIE from `ICDAR 2019 `_. +* IIIT-5k from `CVIT `_. +* Street View Text from `"End-to-End Scene Text Recognition" `_. +* SynthText from `Visual Geometry Group `_. +* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. +* IC03 from `ICDAR 2003 `_. +* IC13 from `ICDAR 2013 `_. +* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. 
+* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. +* IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. toctree:: :maxdepth: 2 - :caption: Notes + :caption: Using docTR + :hidden: - changelog + using_doctr/using_models + using_doctr/using_datasets + using_doctr/using_contrib_modules + using_doctr/sharing_models + using_doctr/using_model_export + using_doctr/custom_models_training + using_doctr/running_on_aws + + +.. toctree:: + :maxdepth: 2 + :caption: Community + :hidden: + + community/resources .. toctree:: :maxdepth: 2 :caption: Package Reference + :hidden: - datasets - documents - models - transforms - utils + modules/contrib + modules/datasets + modules/io + modules/models + modules/transforms + modules/utils + + +.. toctree:: + :maxdepth: 2 + :caption: Contributing + :hidden: + + contributing/code_of_conduct + contributing/contributing + + +.. toctree:: + :maxdepth: 2 + :caption: Notes + :hidden: + + changelog diff --git a/v0.2.1/_sources/installing.rst.txt b/v0.2.1/_sources/installing.rst.txt deleted file mode 100644 index c8ea72a834..0000000000 --- a/v0.2.1/_sources/installing.rst.txt +++ /dev/null @@ -1,41 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.2.1/_sources/models.rst.txt b/v0.2.1/_sources/models.rst.txt deleted file mode 100644 index 9f2276e03f..0000000000 --- a/v0.2.1/_sources/models.rst.txt +++ /dev/null @@ -1,223 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. 
- - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet - - -Post-processing detections -^^^^^^^^^^^^^^^^^^^^^^^^^^ -The purpose of this block is to turn the model output (binary segmentation map for instance), into a set of bounding boxes. - - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). 
- -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 - -Post-processing outputs -^^^^^^^^^^^^^^^^^^^^^^^ -The purpose of this block is to turn the model output (symbol classification for the sequence), into a set of strings. - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 0.595 | 0.625 | | 0.753 | 0.700 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 0.640 | 0.533 | | 0.689 | 0.611 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **0.781** | **0.830** | | **0.875** | 0.660 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. 
- -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.2.1/_sources/transforms.rst.txt b/v0.2.1/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.2.1/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. 
autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.2.1/_sources/utils.rst.txt b/v0.2.1/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.2.1/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.2.1/_static/basic.css b/v0.2.1/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.2.1/_static/basic.css +++ b/v0.2.1/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.2.1/_static/doctools.js b/v0.2.1/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.2.1/_static/doctools.js +++ b/v0.2.1/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.2.1/_static/documentation_options.js b/v0.2.1/_static/documentation_options.js index d266000142..4f656fdbea 100644 --- a/v0.2.1/_static/documentation_options.js +++ b/v0.2.1/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.2.1a0-git', + VERSION: '0.10.1a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.2.1/_static/language_data.js b/v0.2.1/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.2.1/_static/language_data.js +++ b/v0.2.1/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. 
- * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.2.1/_static/searchtools.js b/v0.2.1/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.2.1/_static/searchtools.js +++ b/v0.2.1/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.2.1/changelog.html b/v0.2.1/changelog.html index 5e11507468..fc45a50384 100644 --- a/v0.2.1/changelog.html +++ b/v0.2.1/changelog.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + Changelog - docTR documentation @@ -226,20 +226,42 @@ + diff --git a/v0.2.1/community/resources.html b/v0.2.1/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.2.1/community/resources.html +++ b/v0.2.1/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.2.1/contributing/code_of_conduct.html b/v0.2.1/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.2.1/contributing/code_of_conduct.html +++ b/v0.2.1/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.2.1/contributing/contributing.html b/v0.2.1/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.2.1/contributing/contributing.html +++ b/v0.2.1/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.2.1/datasets.html b/v0.2.1/datasets.html deleted file mode 100644 index 26c04f21ab..0000000000 --- a/v0.2.1/datasets.html +++ /dev/null @@ -1,585 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework -can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.core.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-

Implements an abstract dataset

-
-
Parameters:
-
    -
  • url – URL of the dataset

  • -
  • file_name – name of the file once downloaded

  • -
  • file_hash – expected SHA256 of the file

  • -
  • extract_archive – whether the downloaded file is an archive to be extracted

  • -
  • download – whether the dataset should be downloaded if not present on disk

  • -
  • overwrite – whether the archive should be re-extracted

  • -
-
-
-
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Tensor], Tensor] | None = None, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Tensor], Tensor] | None = None, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Tensor], Tensor] | None = None, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Tensor], Tensor] | None = None, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
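For illustration, here is a minimal hypothetical sketch of how such a dataset could be instantiated (the paths below are placeholders; refer to the parameters above for the expected inputs):
>>> from doctr.datasets import OCRDataset
>>> train_set = OCRDataset(img_folder="path/to/images", label_file="path/to/labels.json")
>>> img, target = train_set[0]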
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before being passed to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets -of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name

size

characters

digits

10

0123456789

ascii_letters

52

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

punctuation

32

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

currency

5

£€¥¢฿

latin

96

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

french

154

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as a mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
- -
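For illustration, here is a minimal hypothetical sketch of a call to encode_sequences with the digits vocab listed above (the exact padding behaviour depends on the eos value you pass):
>>> from doctr.datasets import encode_sequences
>>> # encode two character sequences with the "digits" vocab ("0123456789")
>>> encoded = encode_sequences(sequences=["123", "45"], vocab="0123456789", target_size=5, eos=-1)
>>> encoded.shape  # (2, 5): shorter sequences are padded up to target_size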
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/documents.html b/v0.2.1/documents.html deleted file mode 100644 index e3925d4b59..0000000000 --- a/v0.2.1/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, words lying at the same height but in different columns are considered to belong to two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and the confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
-
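To make the hierarchy concrete, here is a minimal sketch based on the constructors documented above (the geometry, confidence and dimensions values are arbitrary placeholders):
>>> from doctr.documents import Word, Line, Block, Page, Document
>>> word = Word(value="hello", confidence=0.99, geometry=((0.1, 0.1), (0.3, 0.2)))
>>> line = Line(words=[word])
>>> block = Block(lines=[line])
>>> page = Page(blocks=[block], page_idx=0, dimensions=(595, 842))
>>> doc = Document(pages=[page])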

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF in a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/genindex.html b/v0.2.1/genindex.html index 3a0f1cd884..21520455b4 100644 --- a/v0.2.1/genindex.html +++ b/v0.2.1/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -224,20 +224,42 @@

+
+

U

+ + +
+
+

V

@@ -551,7 +711,13 @@

V

W

+
@@ -589,8 +755,8 @@

W

- - + + diff --git a/v0.2.1/getting_started/installing.html b/v0.2.1/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.2.1/getting_started/installing.html +++ b/v0.2.1/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.2.1/index.html b/v0.2.1/index.html index 3a22825833..3a06afc6d9 100644 --- a/v0.2.1/index.html +++ b/v0.2.1/index.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + docTR documentation @@ -226,20 +226,42 @@
-

DocTR: Document Text Recognition

-

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2

+

docTR: Document Text Recognition

+

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

DocTR provides an easy and powerful way to extract valuable information from your documents:

    -
  • 🧾 for automation: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • +
  • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • 👩‍🔬 for research: quickly compare your own architecture's speed & performance with state-of-the-art models on public datasets.

-

Welcome to the documentation of DocTR!

Main Features

  • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

  • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor

  • -
  • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

  • +
  • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

  • ⚡ Optimized for inference speed on both CPU & GPU

  • -
  • 🐦 Light package, small dependencies

  • -
  • 🛠️ Daily maintained

  • -
  • 🏭 Easy integration

  • +
  • 🐦 Light package, minimal dependencies

  • +
  • 🛠️ Actively maintained by Mindee

  • +
  • 🏭 Easy integration (available templates for browser demo & API deployment)

-
-
-

Getting Started

-
-

Build & train your predictor

-
    -
  • Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

  • -
  • Fine-tune or train from scratch any detection or recognition model to specialize on your data

  • -
-

Model zoo

Text detection models

-
-

Text recognition models

-
-

Supported datasets

-
-
+
+
+
+
+
@@ -404,7 +381,7 @@

Supported datasets - +
Next @@ -444,10 +421,8 @@

Supported datasets + diff --git a/v0.2.1/installing.html b/v0.2.1/installing.html deleted file mode 100644 index 7c8f802dee..0000000000 --- a/v0.2.1/installing.html +++ /dev/null @@ -1,390 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

If you are running an OS other than Linux, you will need a few extra dependencies.

-

For MacOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the latest stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/models.html b/v0.2.1/models.html deleted file mode 100644 index e8cb1c9fd9..0000000000 --- a/v0.2.1/models.html +++ /dev/null @@ -1,989 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Input shape

# params

Recall

Precision

Recall

Precision

FPS

db_resnet50

(1024, 1024, 3)

25.2 M

82.14

87.64

92.49

89.66

2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

-

FPS (Frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

-
-
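As a rough, hypothetical sketch of this timing protocol (assuming model is an already instantiated detection model such as db_resnet50; this is not the exact benchmarking script):
>>> import time
>>> import tensorflow as tf
>>> for _ in range(100):  # warm-up
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32))
>>> start = time.time()
>>> for _ in range(1000):  # timed batches of 1 frame
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32))
>>> fps = 1000 / (time.time() - start)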

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following (a short sketch is given after this list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
-
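A minimal sketch of these three steps with plain TensorFlow ops (the normalization statistics below are placeholders, not the actual values used by DocTR):
>>> import tensorflow as tf
>>> images = [tf.random.uniform(shape=[600, 800, 3]), tf.random.uniform(shape=[720, 480, 3])]
>>> # 1. resize each input image to the target size (bilinear interpolation, potential deformation)
>>> resized = [tf.image.resize(img, (1024, 1024), method="bilinear") for img in images]
>>> # 2. batch images together
>>> batch = tf.stack(resized, axis=0)
>>> # 3. normalize the batch using (placeholder) training data statistics
>>> batch = (batch - tf.constant([0.5, 0.5, 0.5])) / tf.constant([0.5, 0.5, 0.5])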

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet
->>> model = linknet(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Post-processing detections

-

The purpose of this block is to turn the model output (a binary segmentation map, for instance) into a set of bounding boxes.

-
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Text recognition model zoo

Architecture

Input shape

# params

FUNSD

CORD

FPS

crnn_vgg16_bn

(32, 128, 3)

15.8M

86.02

91.3

12.8

sar_vgg16_bn

(32, 128, 3)

21.5M

86.2

91.7

3.3

sar_resnet31

(32, 128, 3)

53.1M

86.3

92.1

2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model capabilities

-

FPS (Frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following (a short sketch is given after this list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
-
-
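A minimal sketch of these four steps with plain TensorFlow ops (tf.image.resize_with_pad combines steps 1 and 2; the normalization statistics below are placeholders, not the actual values used by DocTR):
>>> import tensorflow as tf
>>> crops = [tf.random.uniform(shape=[28, 150, 3]), tf.random.uniform(shape=[40, 90, 3])]
>>> # 1. & 2. resize each crop without deformation and pad it to the target size with zeros
>>> padded = [tf.image.resize_with_pad(crop, 32, 128) for crop in crops]
>>> # 3. batch images together
>>> batch = tf.stack(padded, axis=0)
>>> # 4. normalize the batch using (placeholder) training data statistics
>>> batch = (batch - 0.5) / 0.5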

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
-

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import crnn_vgg16_bn
->>> model = crnn_vgg16_bn(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong -Baseline for Irregular Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import sar_vgg16_bn
->>> model = sar_vgg16_bn(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a resnet-31 feature extractor as described in “Show, Attend and Read: A Simple and Strong -Baseline for Irregular Text Recognition”.

-

Example

-
>>> import tensorflow as tf
->>> from doctr.models import sar_resnet31
->>> model = sar_resnet31(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-

Post-processing outputs

-

The purpose of this block is to turn the model output (symbol classification for the sequence) into a set of strings.

-
-
-

Recognition predictors

-

Combining the right components around a given architecture for easier usage.

-
-
-doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
-

Text recognition architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import recognition_predictor
->>> model = recognition_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • -
  • pretrained – If True, returns a model pre-trained on our text recognition dataset

  • -
-
-
Returns:
-

Recognition predictor

-
-
-
- -
-
-
-

End-to-End OCR

-

Predictors that localize and identify text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Recall

Precision

FPS

Recall

Precision

FPS

db_resnet50 + crnn_vgg16_bn

70.08

74.77

0.85

82.19

79.67

1.6

db_resnet50 + sar_vgg16_bn

N/A

N/A

0.49

N/A

N/A

1.0

db_resnet50 + sar_resnet31

N/A

N/A

0.27

N/A

N/A

0.83

Gvision text detection

0.595

0.625

0.753

0.700

Gvision doc. text detection

0.640

0.533

0.689

0.611

AWS textract

0.781

0.830

0.875

0.660

-
-

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All recognition models of predictors are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

-

FPS (Frames per second) is computed as follows: we instantiate the predictor, warm the model up, and then measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

-

Results on private OCR datasets

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Receipts

Invoices

IDs

Architecture

Recall

Precision

Recall

Precision

Recall

Precision

db_resnet50 + crnn_vgg16_bn (ours)

78.90

81.01

65.68

69.86

49.48

50.46

Gvision doc. text detection

68.91

59.89

63.20

52.85

43.70

29.21

AWS textract

75.77

77.70

70.47

69.13

46.39

43.32

-
-
-

Two-stage approaches

-

These architectures involve one stage of text detection and one stage of text recognition. The text detection stage produces cropped images that are then passed to the text recognition block.

-
-
-doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]
-

End-to-end OCR architecture using one model for localization, and another for text recognition.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import ocr_predictor
->>> model = ocr_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

  • -
  • pretrained – If True, returns a model pre-trained on our OCR dataset

  • -
-
-
Returns:
-

OCR predictor

-
-
-
- -
-
-
-

Model export

-

Utility functions to make the most of document analysis models.

-
-

Model compression

-
-
-doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
-

Converts a model to TFLite format

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_tflite, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_tflite(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
-

Converts a model to half precision

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_fp16, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_fp16(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the serialized FP16 model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
-

Quantize a TensorFlow model

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import quantize_model, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = quantize_model(model, (224, 224, 3))
-
-
-
-
-
-
Parameters:
-
    -
  • tf_model – a keras model

  • -
  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

  • -
-
-
Returns:
-

the serialized quantized model

-
-
Return type:
-

bytes

-
-
-
- -
-
-

Using SavedModel

-

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

-
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> _ = model(input_t, training=False)
->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
-
-
-

And loaded just as easily:

-
>>> import tensorflow as tf
->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
-
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/modules/contrib.html b/v0.2.1/modules/contrib.html index e99f6b3f74..7fb86b8b38 100644 --- a/v0.2.1/modules/contrib.html +++ b/v0.2.1/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -380,7 +380,7 @@

Supported contribution modules - + diff --git a/v0.2.1/modules/datasets.html b/v0.2.1/modules/datasets.html index 456e10b172..380a986793 100644 --- a/v0.2.1/modules/datasets.html +++ b/v0.2.1/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -1081,7 +1081,7 @@

Returns:

- + diff --git a/v0.2.1/modules/io.html b/v0.2.1/modules/io.html index 01eadaa4b8..24c41954be 100644 --- a/v0.2.1/modules/io.html +++ b/v0.2.1/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -760,7 +760,7 @@

Returns: - + diff --git a/v0.2.1/modules/models.html b/v0.2.1/modules/models.html index c465cc0586..91b8810a6a 100644 --- a/v0.2.1/modules/models.html +++ b/v0.2.1/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1612,7 +1612,7 @@

Args: - + diff --git a/v0.2.1/modules/transforms.html b/v0.2.1/modules/transforms.html index 30f7a2631a..c5ead3f3ce 100644 --- a/v0.2.1/modules/transforms.html +++ b/v0.2.1/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -835,7 +835,7 @@

Args:< - + diff --git a/v0.2.1/modules/utils.html b/v0.2.1/modules/utils.html index 888a32c321..b7f6fc570b 100644 --- a/v0.2.1/modules/utils.html +++ b/v0.2.1/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -715,7 +715,7 @@

Args: - + diff --git a/v0.2.1/notebooks.html b/v0.2.1/notebooks.html index f97771aebb..d36539f59e 100644 --- a/v0.2.1/notebooks.html +++ b/v0.2.1/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -391,7 +391,7 @@

docTR Notebooks - + diff --git a/v0.2.1/objects.inv b/v0.2.1/objects.inv index 839fd8a5bb..c1700f291b 100644 Binary files a/v0.2.1/objects.inv and b/v0.2.1/objects.inv differ diff --git a/v0.2.1/py-modindex.html b/v0.2.1/py-modindex.html deleted file mode 100644 index c1569be607..0000000000 --- a/v0.2.1/py-modindex.html +++ /dev/null @@ -1,330 +0,0 @@ - - - - - - - - - - - Python Module Index - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
- -
-

Python Module Index

- -
- - - - - - - - - - - -
 
d
- doctr -
- -
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.2.1/search.html b/v0.2.1/search.html index fef17f06e7..d050f5eac7 100644 --- a/v0.2.1/search.html +++ b/v0.2.1/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -226,20 +226,42 @@ - - + + diff --git a/v0.2.1/searchindex.js b/v0.2.1/searchindex.js index f4ed29ff32..6f154115ab 100644 --- a/v0.2.1/searchindex.js +++ b/v0.2.1/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Artefact": [[2, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Block": [[2, "block"]], "Build & train your predictor": [[3, "build-train-your-predictor"]], "Changelog": [[0, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection models": [[5, "detection-models"]], "Detection predictors": [[5, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[3, null]], "Document": [[2, "document"]], "Document structure": [[2, "document-structure"]], "End-to-End OCR": [[5, "end-to-end-ocr"]], "File reading": [[2, "file-reading"]], "Getting Started": [[3, "getting-started"]], "Installation": [[4, null]], "Line": [[2, "line"]], "Main Features": [[3, "main-features"]], "Model compression": [[5, "model-compression"]], "Model export": [[5, "model-export"]], "Model zoo": [[3, "model-zoo"]], "Notes": [[3, null]], "Package Reference": [[3, null]], "Page": [[2, "page"]], "Post-processing detections": [[5, "post-processing-detections"]], "Post-processing outputs": [[5, "post-processing-outputs"]], "Pre-processing for detection": [[5, "pre-processing-for-detection"]], "Pre-processing for recognition": [[5, "pre-processing-for-recognition"]], "Prerequisites": [[4, "prerequisites"]], "Recognition models": [[5, "recognition-models"]], "Recognition predictors": [[5, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[3, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[7, "task-evaluation"]], "Text Detection": [[5, "text-detection"]], "Text Recognition": [[5, "text-recognition"]], "Text detection models": [[3, "text-detection-models"]], "Text recognition model zoo": [[5, "id2"]], "Text recognition models": [[3, "text-recognition-models"]], "Two-stage approaches": [[5, "two-stage-approaches"]], "Using SavedModel": [[5, "using-savedmodel"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[7, "visualization"]], "Word": [[2, "word"]], "doctr.datasets": [[1, null]], "doctr.documents": [[2, null]], "doctr.models": [[5, null]], "doctr.transforms": [[6, null]], "doctr.utils": [[7, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]]}, "docnames": ["changelog", "datasets", "documents", "index", "installing", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "documents.rst", "index.rst", "installing.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.documents)": [[2, 
"doctr.documents.Artefact", false]], "as_images() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.as_images", false]], "block (class in doctr.documents)": [[2, "doctr.documents.Block", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.documents)": [[2, "doctr.documents.Document", false]], "documentfile (class in doctr.documents)": [[2, "doctr.documents.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_images", false]], "from_pdf() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_pdf", false]], "from_url() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_artefacts", false]], "get_words() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.documents)": [[2, "doctr.documents.Line", false]], "linknet() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet", false]], "localizationconfusion (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.LocalizationConfusion", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[5, "doctr.models.zoo.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.documents)": [[2, "doctr.documents.Page", false]], "pdf (class in doctr.documents)": [[2, "doctr.documents.PDF", false]], "quantize_model() (in module doctr.models.export)": [[5, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[6, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": 
[[6, "doctr.transforms.RandomJpegQuality", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.documents)": [[2, "doctr.documents.read_html", false]], "read_img() (in module doctr.documents)": [[2, "doctr.documents.read_img", false]], "read_pdf() (in module doctr.documents)": [[2, "doctr.documents.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_vgg16_bn", false]], "show() (doctr.documents.document method)": [[2, "doctr.documents.Document.show", false]], "show() (doctr.documents.page method)": [[2, "doctr.documents.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[7, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[7, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[7, "doctr.utils.metrics.TextMatch.summary", false]], "textmatch (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.core)": [[1, "doctr.datasets.core.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[7, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.documents)": [[2, "doctr.documents.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, "", "CORD"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.core": [[1, 0, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.documents": [[2, 0, 1, "", "Artefact"], [2, 0, 1, "", "Block"], [2, 0, 1, "", "Document"], [2, 0, 1, "", "DocumentFile"], [2, 0, 1, "", "Line"], [2, 0, 1, "", "PDF"], [2, 0, 1, "", "Page"], [2, 0, 1, "", "Word"], [2, 1, 1, "", "read_html"], [2, 1, 1, "", "read_img"], [2, 1, 1, "", "read_pdf"]], "doctr.documents.Document": [[2, 2, 1, "", "show"]], "doctr.documents.DocumentFile": [[2, 2, 1, "", "from_images"], [2, 2, 1, "", "from_pdf"], [2, 2, 1, "", "from_url"]], "doctr.documents.PDF": [[2, 2, 1, "", "as_images"], [2, 2, 1, "", "get_artefacts"], [2, 2, 1, "", "get_words"]], "doctr.documents.Page": [[2, 2, 1, "", "show"]], "doctr.models.detection": [[5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet"]], "doctr.models.export": [[5, 1, 1, "", "convert_to_fp16"], [5, 1, 1, "", "convert_to_tflite"], [5, 1, 1, "", "quantize_model"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"], [5, 1, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[5, 1, 1, "", "ocr_predictor"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", "RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", 
"RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[7, 0, 1, "", "LocalizationConfusion"], [7, 0, 1, "", "OCRMetric"], [7, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.OCRMetric": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.TextMatch": [[7, 2, 1, "", "summary"]], "doctr.utils.visualization": [[7, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 7], "0": [1, 3, 5, 6, 7], "00": [], "01": 5, "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 5, "02562": [], "03": 3, "035": [], "0361328125": [], "04": [], "05": 3, "06": [], "06640625": [], "07": [], "08": 5, "09": [], "0966796875": [], "1": [1, 3, 5, 6, 7], "10": [1, 7], "100": [5, 6, 7], "1000": 5, "101": [], "1024": 5, "104": [], "106": [], "108": [], "1095": [], "11": 3, "110": 7, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 5, "120": [], "123": [], "126": [], "1268": [], "128": 5, "13": 5, "130": [], "13068": [], "131": [], "1337891": [], "1357421875": [], "1396484375": [], "14": 5, "1420": [], "14470v1": [], "149": [], "15": 5, "150": 7, "154": 1, "1552": [], "16": 5, "1630859375": [], "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": 3, "185546875": [], "19": 5, "1900": [], "1910": [], "19342": [], "19370": [], "195": [], "19598": [], "199": 5, "1999": [], "1m": 5, "2": [3, 5, 6], "20": 5, "200": 7, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 3, "2021": 3, "2023": [], "207901": [], "21": 5, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6], "225": 6, "22672": [], "229": 6, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 5, "2504": [], "255": [5, 6, 7], "256": 5, "257": [], "26": [], "26032": [], "264": [], "27": 5, "2700": [], "2710": [], "2749": [], "28": [], "287": [], "29": 5, "296": [], "299": [], "2d": [], "3": [2, 3, 4, 5, 6, 7], "30": [], "300": [], "3000": [], "301": [], "30595": 5, "30ghz": [], "31": 5, "32": [1, 5, 6], "3232421875": [], "33": [], "33402": [], "33608": [], "34": [], "340": [], "3456": [], "3515625": [], "36": [], "360": [], "37": [], "38": [], "39": 5, "4": [], "40": [], "406": 6, "41": [], "42": [], "43": 5, "44": [], "45": [], "456": 6, "46": 5, "47": 5, "472": [], "48": 5, "485": 6, "49": 5, "49377": [], "5": [1, 6, 7], "50": 5, "51": [], "51171875": [], "512": [], "52": [1, 5], "529": [], "53": 5, "533": 5, "54": [], "540": [], "5478515625": [], "55": [], "56": [], "57": [], "58": [], "580": [], "5810546875": [], "583": [], "59": 5, "595": 5, "597": [], "5k": [], "5m": 5, "6": [4, 5, 6], "60": 6, "600": [5, 7], "61": [], "611": 5, "62": [], "625": 5, "626": [], "629": [], "63": 5, "630": [], "64": [5, 6], "640": 5, "641": [], "647": [], "65": 5, "66": 5, "660": 5, "664": [], "666": [], "67": 5, "672": [], "68": 5, "689": 5, "69": 5, "693": [], "694": [], "695": [], "6m": [], "7": 5, "70": [5, 7], "700": 5, "701": [], "702": [], "707470": [], "71": [], "7100000": [], "713": [], "7141797": [], "7149": [], "72": [], "72dpi": [], "73": [], "73257": [], 
"733": [], "74": 5, "745": [], "75": 5, "753": 5, "7581382": [], "76": [], "77": 5, "772": [], "772875": [], "78": 5, "780": [], "781": 5, "783": [], "785": [], "789": [], "79": 5, "793533": [], "796": [], "798": [], "7m": [], "8": [5, 6], "80": [], "800": [5, 7], "81": 5, "817": [], "82": 5, "8275l": 5, "83": 5, "830": 5, "84": [], "849": [], "85": 5, "8564453125": [], "857": [], "85875": [], "86": 5, "860": [], "8603515625": [], "862": [], "863": [], "87": 5, "8707": [], "875": 5, "88": [], "89": 5, "8m": 5, "9": [], "90": 5, "90k": [], "90kdict32px": [], "91": 5, "913": [], "914085328578949": [], "917": [], "92": 5, "921": [], "93": [], "94": [], "95": 7, "9578408598899841": [], "96": 1, "97": [], "98": [], "99": [], "9949972033500671": [], "A": [1, 2, 3, 5], "And": 5, "As": [], "Be": [], "Being": [], "By": [], "For": [4, 5], "If": [2, 4, 5], "In": 5, "It": 6, "Its": 5, "No": [], "Of": 1, "Or": [], "The": [1, 2, 5, 7], "Then": 5, "To": [], "_": [1, 5], "__call__": [], "_build": [], "_i": 7, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": [], "abl": [], "about": 5, "abov": 5, "abstract": 1, "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 3], "account": [], "accur": [], "accuraci": 7, "achiev": [], "act": [], "action": [], "activ": [], "ad": 6, "adapt": [], "add": [6, 7], "add_hook": [], "add_label": 7, "addit": [], "addition": 5, "address": 2, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": [], "aggreg": [1, 7], "aggress": [], "align": 2, "all": [1, 2, 3, 5, 6, 7], "allow": [], "along": 5, "alreadi": [], "also": [], "alwai": [], "an": [1, 2, 3, 5, 7], "analysi": [2, 5], "ancient_greek": [], "andrej": [], "angl": 2, "ani": [1, 2, 3, 5, 7], "annot": 2, "anot": [], "anoth": [1, 4, 5], "answer": [], "anyascii": [], "anyon": 3, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": 5, "appoint": [], "appreci": [], "appropri": [], "ar": [1, 2, 4, 5, 6, 7], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [3, 5], "archiv": 1, "area": [], "argument": [1, 2], "around": 5, "arrai": [2, 7], "art": 3, "artefact": 7, "artefact_typ": 2, "articl": [], "artifici": [], "arxiv": [], "as_imag": 2, "asarrai": 7, "ascii_lett": 1, "aspect": 6, "assess": 7, "assign": 7, "associ": 2, "assum": [], "assume_straight_pag": [], "astyp": [5, 7], "attack": [], "attend": [3, 5], "attent": [], "autoclass": [], "autom": 3, "automat": [], "autoregress": [], "avail": [3, 5, 6], "averag": [5, 6], "avoid": [], "aw": [3, 5], "awar": [], "azur": [], "b": 7, "b_j": 7, "back": [], "backbon": 5, "backend": 5, "background": [], "bangla": [], "bar": [], "bar_cod": [], "baranovskij": [], "base": 5, "baselin": 5, "batch": [1, 5, 6], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [5, 7], "befor": 1, "begin": 7, "behavior": [], "being": [5, 7], "belong": [], "benchmark": [], "best": [], "better": [], "between": [6, 7], "bgr": 2, "bilinear": [5, 6], "bin_thresh": [], "binar": [3, 5], "binari": [2, 5], "bit": [], "block": [5, 7], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 2, 5, 6, 7], "boolean": [], "both": [3, 5, 6], "bottom": [], "bound": [2, 5, 6, 7], "box": [2, 5, 7], "box_thresh": [], "brew": 4, "bright": 6, "browser": [], "build": [], "built": [], "byte": [2, 5], "c": [], "c5": 5, "c_j": [], "cach": [], "cache_sampl": [], 
"cairo": 4, "call": [], "callabl": [1, 6], "can": [1, 4, 5], "capabl": 5, "case": 7, "cf": 5, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": 3, "channel": [2, 5, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 3, 5, 7], "charactergener": [], "characterist": [], "charg": 5, "charset": [], "chart": 2, "check": [], "checkpoint": [], "chip": [], "christian": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 2, 6, 7], "class_nam": [], "classif": 5, "classmethod": 2, "clear": [], "clone": 4, "close": [], "co": [], "code": [2, 3], "codecov": [], "colab": [], "collate_fn": [], "collect": 2, "color": 6, "colorinvers": 6, "column": 2, "com": [2, 4], "combin": 5, "command": [], "comment": [], "commit": [], "common": [6, 7], "commun": [], "compar": 3, "comparison": 7, "competit": 1, "compil": [], "complaint": [], "complementari": 7, "complet": [], "compon": 5, "compos": [1, 3, 5], "comprehens": [], "comput": [5, 7], "conf_threshold": [], "confid": 2, "config": [], "configur": [], "confus": 7, "consecut": [5, 6], "consequ": [], "consid": [2, 7], "consist": [], "consolid": [1, 3], "constant": 6, "construct": [], "contact": [], "contain": [], "content": [1, 2], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 5, "convers": 2, "convert": [2, 5, 6], "convert_page_to_numpi": 2, "convert_to_fp16": 5, "convert_to_tflit": 5, "convolut": 3, "cool": [], "coordin": 2, "cord": [1, 3, 5], "core": [1, 7], "corner": [], "correct": 6, "correspond": 5, "could": [], "counterpart": 7, "cover": [], "coverag": [], "cpu": [3, 5], "creat": [], "crnn": [3, 5], "crnn_mobilenet_v3_larg": [], "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 5, "crnn_vgg16_bn": 5, "crop": 5, "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": [], "daili": 3, "danish": [], "data": [2, 3, 5, 6, 7], "dataload": 1, "dataset": 5, "dataset_info": [], "date": [], "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": [], "db_resnet34": [], "db_resnet50": 5, "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [3, 5], "deal": [], "decis": [], "decod": 2, "decode_img_as_tensor": [], "dedic": [], "deem": [], "deep": 5, "def": [], "default": [2, 5], "defer": 1, "defin": 7, "deform": 5, "degre": [], "degress": 2, "delet": [], "delimit": [], "delta": 6, "demo": [], "demonstr": [], "depend": [3, 4], "deploi": [], "deploy": [], "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": [], "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": 5, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": [], "developp": 4, "deviat": 6, "devic": [], "dict": [2, 7], "dictionari": [2, 7], "differ": [], "differenti": [3, 5], "digit": 1, "dimens": [2, 5, 7], "dimension": 6, "direct": [], "directli": 5, "directori": [], "disabl": [], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 5, "discuss": [], "disk": 1, "disparag": [], "displai": [2, 
7], "display_artefact": 7, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": [], "do": [], "doc": [2, 5], "docartefact": [], "docstr": [], "doctr": 4, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 5, 7], "documentbuild": [], "documentfil": 2, "doesn": [], "don": [], "done": 6, "download": 1, "downsiz": [], "draw": 6, "drop": 1, "drop_last": 1, "dtype": 5, "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": [], "dynamic_seq_length": [], "e": [2, 4], "each": [1, 2, 3, 5, 6, 7], "eas": [], "easi": [3, 7], "easier": 5, "easili": [2, 5, 7], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 5], "either": 5, "element": [1, 2, 5], "els": [], "email": [], "empathi": [], "en": [], "enabl": 2, "enclos": 2, "encod": [1, 2, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 3, 7], "english": [], "enough": 5, "ensur": [], "entir": 2, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 2, "ethnic": [], "evalu": [1, 3, 5], "event": [], "everyon": [], "everyth": [], "exact": 7, "exactmatch": [], "exampl": [1, 2, 5, 6, 7], "exchang": [], "exclud": 5, "execut": [], "exist": [], "expand": [], "expect": [1, 2, 5, 6], "experi": 5, "explan": 5, "explicit": [], "exploit": 5, "export": [2, 3, 7], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 2, "extern": [], "extra": 4, "extract": [1, 3], "extract_arch": 1, "extractor": 5, "f_": 7, "f_a": 7, "factor": 6, "fair": [], "fairli": [], "fals": [1, 5, 6], "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [5, 7], "feed": 5, "feedback": [], "feel": [], "felix92": [], "few": 4, "figsiz": 7, "figur": 7, "file": [1, 3], "file_hash": 1, "file_nam": 1, "final": [], "find": 4, "fine": 3, "finnish": [], "first": [], "firsthand": [], "fit": [], "fitz": 2, "flag": [], "flexibl": 7, "flip": [], "float": [2, 6, 7], "float32": 5, "fn": 6, "focu": [], "focus": [], "folder": [1, 5], "follow": [4, 5, 6, 7], "font": [], "font_famili": [], "foral": 7, "forc": [], "forg": [], "form": [1, 3], "format": [2, 5], "forpost": [1, 3], "forum": [], "found": [], "fp": 5, "fp16": 5, "frac": 7, "frame": 5, "framework": 1, "free": [], "french": [1, 5], "friendli": 3, "from": [1, 2, 3, 5, 6, 7], "from_hub": [], "from_imag": 2, "from_pdf": 2, "from_url": 2, "full": [1, 5, 7], "fulli": [], "function": [5, 6, 7], "funsd": [1, 3, 5], "further": [], "futur": [], "g": 2, "g_": 7, "g_x": 7, "gallagh": [], "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 4, "gen": [], "gender": [], "gener": [], "generic_cyrillic_lett": [], "geometri": 2, "geq": 7, "german": [], "get": 2, "get_artefact": 2, "get_word": 2, "gettextword": 2, "git": 3, "github": 4, "give": [], "given": [1, 2, 5, 7], "global": [], "go": [], "good": [], "googl": [], "googlevis": 3, "gpu": 3, "gracefulli": [], "graph": 2, "grayscal": 6, "ground": 7, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "gtk": 4, "guid": [], "guidanc": [], "gvision": 5, "h": 2, "h_": 7, "ha": [1, 7], "half": 5, "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 7, "have": [1, 5, 7], "head": [], "healthi": [], "hebrew": [], "height": 2, "hello": 7, "help": [], "here": [1, 4, 6], "hf": [], "hf_hub_download": [], "high": 2, "higher": 4, "hindi": [], "hindi_digit": [], "hocr": [], 
"hook": [], "horizont": 2, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [2, 4], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 2, 5, 6, 7], "i7": [], "ibrahimov": [], "ic03": [], "ic13": [], "icdar": 3, "icdar2019": 1, "id": 5, "ident": [], "identifi": [3, 5], "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 2, 5, 6, 7], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": [], "img_fold": 1, "img_path": [], "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 2, 5, 6, 7], "import": [1, 2, 5, 6, 7], "improv": [], "inappropri": [], "incid": [], "includ": [4, 5], "inclus": [], "increas": 6, "independ": [], "index": 2, "indic": 7, "individu": [], "infer": [3, 6], "inform": [1, 3, 5], "inherit": [1, 5], "input": [2, 5, 6], "input_crop": [], "input_pag": [5, 7], "input_shap": 5, "input_t": 5, "input_tensor": 5, "inspir": 6, "instal": 3, "instanc": 5, "instanti": 5, "instead": 2, "insult": [], "int": [1, 2, 5, 6], "int64": [], "integ": 7, "integr": 3, "intel": [], "interact": [2, 7], "interfac": [], "interoper": [], "interpol": [5, 6], "interpret": [1, 2], "intersect": 7, "invert": 6, "investig": [], "invis": [], "invoic": 5, "involv": 5, "io": [], "iou": 7, "iou_thresh": 7, "iou_threshold": [], "irregular": 5, "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 2, 5, 7], "itself": [], "j": 7, "jame": [], "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 2], "json": [], "json_output": [], "jump": [], "just": 5, "kei": [], "kera": 5, "kernel": [], "kernel_s": 5, "kernel_shap": [], "keywoard": [], "keyword": [1, 2], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 2, 5, 7], "l": 7, "l_j": 7, "label": [1, 7], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": [], "languag": [2, 3], "larg": [], "largest": 7, "last": [1, 4, 5], "latenc": [], "later": [], "latest": 4, "latin": 1, "layer": [], "layout": [], "lead": [], "leader": [], "learn": 5, "least": [], "left": 7, "legacy_french": [], "length": 1, "less": [], "let": 5, "letter": [], "level": [5, 7], "levenshtein": [], "leverag": [], "lf": [], "libffi": 4, "librari": 4, "light": 3, "lightweight": [], "like": [], "limits_": 7, "line": [3, 7], "line_1_1": [], "link": [], "linknet": [3, 5], "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 4, "list": [1, 2, 6], "ll": 7, "load": [3, 5], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 3, 5, 7], "localis": [], "localizationconfus": 7, "locat": [], "login": [], "login_to_hub": [], "logo": 2, "love": [], "lower": [6, 7], "m": [5, 7], "m1": [], "macbook": [], "machin": [], "maco": 4, "made": 3, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 3, "mainten": [], "make": [5, 7], "mani": [], "manipul": [], "map": [1, 5], "map_loc": [], "master": [], "match": [3, 7], "mathcal": 7, "matplotlib": 7, "max": 7, "max_angl": [], "max_area": [], "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": [], "maximum": 1, "maxval": [5, 6], "mbox": 7, "mean": [6, 7], "meaniou": 7, "meant": 2, "measur": 5, "media": [], "median": [], "meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [5, 7], "middl": [], "might": 
5, "min": [], "min_area": [], "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": [], "min_val": 6, "minde": 4, "minim": [], "minimalist": [], "minimum": 7, "minval": 6, "miss": [], "mistak": [], "mix": 3, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": [], "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 4, "model": [1, 7], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [2, 5, 6, 7], "more": [], "moscardi": [], "most": 5, "mozilla": [], "multi": [], "multilingu": [], "multipl": [1, 2, 6], "multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 5, 7], "na": [], "name": [1, 5], "nation": [], "natur": 3, "ndarrai": [1, 2, 7], "necessari": [], "need": [4, 7], "neg": 6, "nest": [], "nestedobject": 6, "netraj": [], "network": [3, 5], "neural": [3, 5], "new": [], "newer": [], "next": 1, "nois": [], "noisi": [1, 3], "non": [2, 6, 7], "none": [1, 2], "normal": [5, 6], "norwegian": [], "note": 0, "now": [], "np": [5, 7], "num_output_channel": [], "num_sampl": [], "number": [1, 6, 7], "numpi": [2, 5, 7], "o": 4, "obb": [], "obj_detect": [], "object": 1, "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [1, 3, 7], "ocr_carea": [], "ocr_db_crnn": 7, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 5, "ocrdataset": 1, "ocrmetr": 7, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": [1, 5], "one": [1, 5, 6], "oneof": 6, "ones": [], "onli": [6, 7], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [3, 5], "optim": 3, "option": [], "order": [1, 2, 5], "org": [], "organ": 2, "orient": 2, "orientationpredictor": [], "other": [], "otherwis": 7, "our": 5, "out": [5, 6, 7], "outpout": [], "output": [2, 6], "output_s": [2, 6], "outsid": [], "over": [4, 7], "overal": [], "overlai": 2, "overview": [], "overwrit": 1, "overwritten": [], "own": 3, "p": 6, "packag": 7, "pad": [1, 5, 6], "page": [5, 7], "page1": 2, "page2": 2, "page_1": [], "page_idx": 2, "page_orientation_predictor": [], "page_param": [], "pair": 7, "pango": 4, "paper": [], "par_1_1": [], "paragraph": [], "paragraph_break": [], "parallel": [], "param": [5, 6], "paramet": [1, 2, 3, 5, 6, 7], "pars": [1, 3], "parseq": [], "part": 6, "parti": [], "partial": [], "particip": [], "pass": [1, 5], "password": [], "patch": [], "path": [1, 2, 5], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "patil": [], "pattern": [], "pdf": 2, "pdfpage": [], "peopl": [], "per": [5, 6], "perform": [2, 3, 5, 6, 7], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 2, "pick": 6, "pictur": 2, "pip": 4, "pipelin": [], "pixbuf": 4, "pixel": [2, 6], "platinum": 5, "pleas": [], "plot": 7, "plt": 7, "plug": [], "plugin": [], "png": 2, "point": [], "polici": [], "polish": [], "polit": [], "polygon": [], "pool": [], "portugues": [], "posit": 7, "possibl": 7, "post": [], "postprocessor": [], "potenti": 5, "power": 3, "ppageno": [], "pre": [], "precis": [5, 7], "pred": [], "pred_box": [], "pred_label": [], "predefin": 1, "predict": [2, 7], "predictor": [], "prefer": 1, "preinstal": [], "preprocessor": 5, "prerequisit": 3, "present": 1, "preserv": 6, 
"preserve_aspect_ratio": 6, "pretrain": [3, 5, 7], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": 5, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 3], "processor": 5, "produc": 5, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 5, "provid": [3, 5], "public": 3, "publicli": [], "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": 5, "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 7, "python": 3, "python3": [], "pytorch": [], "q": [], "qr": 2, "qr_code": [], "qualiti": 6, "quantiz": 5, "quantize_model": 5, "question": [], "quickli": 3, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [5, 7], "random": [5, 6, 7], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": [], "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": [], "randomsatur": 6, "randomshadow": [], "rang": 6, "rassi": [], "ratio": 6, "raw": [2, 7], "re": 1, "read": [3, 5], "read_html": 2, "read_img": 2, "read_img_as_numpi": [], "read_img_as_tensor": [], "read_pdf": 2, "readi": [], "real": [5, 6], "realli": [], "reason": [], "rebuild": [], "rebuilt": [], "recal": [5, 7], "receipt": [1, 3, 5], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 7, "recognition_predictor": 5, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": [], "recurr": 3, "reduc": 6, "refer": [], "regardless": [], "region": [], "regroup": 7, "regular": [], "reject": [], "rel": 2, "relat": [], "releas": [0, 4], "relev": [], "religion": [], "relu": 5, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [2, 5], "represent": 5, "request": [], "requir": [4, 6], "research": 3, "residu": [], "resiz": [5, 6], "resnet": 5, "resnet18": [], "resnet31": [], "resnet34": [], "resnet50": [], "resolv": 2, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [6, 7], "restrict": [], "result": [2, 5], "return": [1, 2, 5, 7], "reusabl": 5, "review": [], "rgb": [2, 6], "rgb_mode": [], "rgb_output": 2, "right": [5, 7], "roboflow": [], "robust": 3, "root": 1, "rotat": 2, "run": 4, "same": [2, 7], "sampl": 1, "sample_transform": 1, "sanjin": [], "sar": [3, 5], "sar_resnet31": 5, "sar_vgg16_bn": 5, "satur": 6, "save": [1, 5], "saved_model": 5, "scale": 7, "scale_rang": [], "scan": [1, 3], "scene": 5, "scheme": 5, "score": 7, "scratch": 3, "script": [], "seamless": 3, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 5, "section": [], "secur": [], "see": [], "seemlessli": 3, "seen": 5, "segment": 5, "self": [], "semant": 5, "send": [], "sens": 7, "sensit": [], "separ": 5, "sequenc": [1, 2, 5, 7], "sequenti": [5, 6], "seri": [], "serial": 5, "serialized_model": 5, "seriou": [], "set": [1, 5, 7], "set_global_polici": [], "sever": [2, 6], "sex": [], "sexual": [], "sha256": 1, "shade": [], "shape": [2, 5, 6, 7], "share": [], "shift": 6, "shm": [], "should": [1, 2, 7], "show": [2, 3, 5, 7], "showcas": [], "shuffl": 1, "side": 7, "signatur": 2, "signific": 1, "simpl": 5, "simpler": [], "sinc": 1, "singl": [], "single_img_doc": [], "size": [1, 2, 5, 6], "skew": [], "slack": [], "slightli": [], "small": 3, "smallest": 2, "snapshot_download": [], "snippet": [], "so": [], "social": [], "socio": [], "some": [], "someth": [], "somewher": [], "sort": [], "sourc": [1, 2, 5, 6, 7], "space": 
[], "span": [], "spanish": [], "spatial": 2, "special": 3, "specif": [1, 5, 7], "specifi": 2, "speed": [3, 5], "sphinx": [], "sroie": [1, 3], "stabl": 4, "stackoverflow": [], "stage": 3, "standalon": [], "standard": 6, "start": [], "state": 3, "static": 7, "statist": 5, "statu": [], "std": 6, "step": [], "still": [], "str": [1, 2, 5, 6, 7], "straight": [], "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 2, "street": [], "strict": [], "strictli": 7, "string": [1, 2, 5, 7], "strive": [], "strong": 5, "structur": [3, 5], "subset": [1, 5], "suggest": [], "sum": 7, "summari": 7, "support": 5, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": 5, "symmetr": 6, "symmetric_pad": 6, "synthet": [], "synthtext": [], "system": [], "t": 1, "tabl": [], "take": [], "target": [1, 2, 5, 6], "target_s": 1, "task": [1, 3, 5], "task2": [], "team": [], "techminde": [], "templat": 2, "tensor": [1, 5, 6], "tensorflow": [3, 5, 6], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [2, 7], "text_output": [], "textmatch": 7, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [3, 5], "textstylebrush": [], "textual": [1, 2, 3], "tf": [5, 6], "tf_model": 5, "tflite": 5, "than": [4, 7], "thank": [], "thei": [], "them": [1, 4], "thi": [4, 5, 7], "thing": [], "third": [], "those": [2, 4, 5], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 5, 7], "tini": [], "titl": 2, "tm": [], "tmp": [], "togeth": [2, 5], "tograi": 6, "tool": [], "top": 7, "topic": [], "torch": [], "torchvis": 6, "total": [], "toward": [], "train": [1, 5, 6], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": 5, "tranform": 6, "transcrib": [], "transfer": [], "transfo": 6, "transform": [1, 3], "translat": [], "troll": [], "true": [1, 2, 5, 6, 7], "truth": 7, "tune": 3, "tupl": [2, 5, 6, 7], "turn": 5, "two": 2, "txt": [], "type": [2, 5], "typic": [], "u": [], "ucsd": [], "udac": [], "uint8": [2, 5, 7], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 2, "understand": [1, 3], "unidecod": 7, "uniform": [5, 6], "uniformli": [], "uninterrupt": 2, "union": 7, "unit": [], "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 5, "updat": 7, "upgrad": [], "upper": 6, "uppercas": [], "url": [1, 2], "us": [1, 4, 7], "usabl": 5, "usag": 5, "use_polygon": [], "useabl": [], "user": [2, 3, 4], "utf": [], "util": [3, 5], "v0": 3, "v1": [], "v3": [], "valid": [], "valu": [2, 6], "valuabl": 3, "variabl": [], "varieti": [], "veri": [], "verifi": 1, "verma": [], "version": 5, "vgg": 5, "vgg16": 5, "vgg16_bn_r": [], "via": 3, "video": [], "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 1, "visiontransform": [], "visual": 3, "visualize_pag": 7, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": [3, 5], "vocabulari": [], "w": [2, 7], "w3": [], "wa": [], "wai": [1, 3, 5], "want": [], "warm": 5, "warmup": [], "wasn": [], "we": [2, 3, 5, 6], "weasyprint": [], "web": 2, "websit": [], "welcom": 3, "well": [], "were": 2, "what": [], "when": [], "whenev": [], "where": [2, 7], "whether": [1, 2, 7], "which": 5, "whichev": [], "while": 6, "why": [], "width": 2, "wiki": [], "wildreceipt": [], "window": [4, 7], "wish": [], "within": [], "without": 5, "wonder": [], "word": [3, 5, 7], "word_1_1": [], "word_1_2": [], "word_1_3": [], 
"wordgener": [], "words_onli": 7, "work": [], "worker": 1, "workflow": [], "worklow": [], "world": 7, "worth": [], "wrap": [], "wrapper": [1, 6], "write": [], "written": 2, "www": 2, "x": [2, 6, 7], "x12larg": 5, "x_ascend": [], "x_descend": [], "x_i": 7, "x_size": [], "x_wconf": [], "xeon": 5, "xhtml": [], "xmax": 2, "xmin": 2, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 7, "y_i": 7, "y_j": 7, "yet": [], "ymax": 2, "ymin": 2, "yolov8": [], "you": [4, 5], "your": [1, 2, 5, 7], "yoursit": 2, "yugesh": [], "zero": [5, 6], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "doctr.documents", "DocTR: Document Text Recognition", "Installation", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": [], "03": 0, "04": [], "05": 0, "07": [], "08": [], "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "21": [], "22": [], "27": [], "28": [], "29": [], "3": [], "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 5, "architectur": [], "arg": [], "artefact": 2, "artefactdetect": [], "attribut": [], "avail": 1, "aw": [], "ban": [], "block": 2, "bug": [], "build": 3, "changelog": 0, "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 5, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 3], "detect": [3, 5], "develop": [], "do": [], "doctr": [1, 2, 3, 5, 6, 7], "document": [2, 3], "end": 5, "enforc": [], "evalu": 7, "export": 5, "factori": [], "featur": 3, "feedback": [], "file": 2, "from": [], "gener": [], "get": 3, "git": 4, "guidelin": [], "half": [], "hub": [], "huggingfac": [], "i": [], "implement": [], "infer": [], "instal": 4, "integr": [], "io": [], "lambda": [], "let": [], "line": 2, "linux": [], "load": 1, "loader": [], "main": 3, "mode": [], "model": [3, 5], "modifi": [], "modul": [], "name": [], "note": 3, "notebook": [], "object": [], "ocr": 5, "onli": [], "onnx": [], 
"optim": [], "option": [], "orient": [], "our": [], "output": 5, "own": [], "packag": [3, 4], "page": 2, "perman": [], "pipelin": [], "pledg": [], "post": 5, "pre": 5, "precis": [], "predictor": [3, 5], "prepar": [], "prerequisit": 4, "pretrain": [], "process": 5, "push": [], "python": 4, "qualiti": [], "question": [], "read": 2, "readi": [], "recognit": [3, 5], "refer": 3, "report": [], "request": [], "resourc": [], "respons": [], "return": [], "right": [], "savedmodel": 5, "scope": [], "share": [], "should": [], "stage": 5, "standard": [], "start": 3, "structur": 2, "style": [], "support": [1, 3, 6], "synthet": [], "task": 7, "temporari": [], "test": [], "text": [3, 5], "train": 3, "transform": 6, "two": 5, "unit": [], "us": 5, "util": 7, "v0": 0, "verif": [], "via": 4, "visual": 7, "vocab": 1, "warn": [], "what": [], "word": 2, "your": 3, "zoo": [3, 5]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, "codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], 
"Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, "end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], "Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train 
your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], 
"channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], 
"line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation 
(class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", 
false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", 
"Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], 
"51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, 
"b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], 
"db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 
18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], 
"json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 
19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 
15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 
3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 
18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, 
"coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.2.1/transforms.html b/v0.2.1/transforms.html deleted file mode 100644 index 7f48d92039..0000000000 --- a/v0.2.1/transforms.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.
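For instance, here is a minimal sketch (assuming the TensorFlow backend used throughout the examples on this page) that chains two of the transformations documented below by calling them one after the other; the Compose wrapper described at the end of this page packages the same idea:

>>> import tensorflow as tf
>>> from doctr.transforms import Resize, Normalize
>>> resize = Resize((32, 32))
>>> normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
>>> img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
>>> out = normalize(resize(img))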

-
-

Supported transformations

-

Here are all transformations that are available through docTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
- -
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor to a Gaussian distribution for each channel

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Wraps a user-defined callable and applies it to the input tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.6)[source]
-

Applies the following transformation to a tensor (image or batch of images): -convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta -to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – the offset added to each pixel is randomly picked in [-max_delta, max_delta]

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting -each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and -increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – the offset added to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.

-
-
-class doctr.transforms.Compose(transforms: List[NestedObject])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[NestedObject])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomJpegQuality, RandomGamma
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, one only will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: NestedObject, p: float = 0.5)[source]
-

Apply the input transformation with a probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
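Putting the pieces together, a possible augmentation pipeline could look like the sketch below; it only reuses classes documented on this page, and the parameter values are illustrative rather than recommended defaults:

>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, OneOf, RandomApply, RandomBrightness, RandomContrast, RandomGamma
>>> transfos = Compose([Resize((32, 32)), RandomApply(RandomGamma(), p=0.5), OneOf([RandomBrightness(), RandomContrast()])])
>>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))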
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.2.1/using_doctr/custom_models_training.html b/v0.2.1/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.2.1/using_doctr/custom_models_training.html +++ b/v0.2.1/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.2.1/using_doctr/running_on_aws.html b/v0.2.1/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.2.1/using_doctr/running_on_aws.html +++ b/v0.2.1/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.2.1/using_doctr/sharing_models.html b/v0.2.1/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.2.1/using_doctr/sharing_models.html +++ b/v0.2.1/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.2.1/using_doctr/using_contrib_modules.html b/v0.2.1/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.2.1/using_doctr/using_contrib_modules.html +++ b/v0.2.1/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.2.1/using_doctr/using_datasets.html b/v0.2.1/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.2.1/using_doctr/using_datasets.html +++ b/v0.2.1/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.2.1/using_doctr/using_model_export.html b/v0.2.1/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.2.1/using_doctr/using_model_export.html +++ b/v0.2.1/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.2.1/using_doctr/using_models.html b/v0.2.1/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.2.1/using_doctr/using_models.html +++ b/v0.2.1/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.2.1/utils.html b/v0.2.1/utils.html deleted file mode 100644 index 5630a0b847..0000000000 --- a/v0.2.1/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.utils

-

This module regroups non-core features that are complementary to the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest windows side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model performances.

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements text match metric (word-level accuracy) for recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, -TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, -f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, -\(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode -counterpart and its lower-case unidecode counterpart

-
-
-
- -
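As an illustration, for the example above (predictions ['Hello', 'world'] against ['hello', 'world']) the raw exact-match score is 0.5 while the lower-case score is 1.0; the summary might therefore look like the following, keeping in mind that the exact key names are indicative and may differ between versions:

>>> metric.summary()
{'raw': 0.5, 'caseless': 1.0, 'unidecode': 0.5, 'unicase': 1.0}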
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5)[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ -Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, -g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float, float, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores

-
-
-
- -
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5)[source]
-

Implements end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, -\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ -Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, -h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{L}\) is the set of possible character sequences, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float], Dict[str, float], float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall & precision for each string comparison flexibility and the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/datasets/cord.html b/v0.3.0/_modules/doctr/datasets/cord.html index f98ee6901c..55b0584830 100644 --- a/v0.3.0/_modules/doctr/datasets/cord.html +++ b/v0.3.0/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.cord

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
-from doctr.utils.geometry import fit_rbbox
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['CORD']
+__all__ = ["CORD"]
 
 
 
-[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - Example:: - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 + :align: center + + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', - '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', - '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", + "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", + "cord_train.zip", + ) + + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", + "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", + "cord_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - - # # List images - self.root = os.path.join(self._root, 'image') - self.data: List[Tuple[str, Dict[str, Any]]] = [] + # List images + tmp_root = os.path.join(self.root, "image") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] self.train = train - self.sample_transforms = sample_transforms - for img_path in os.listdir(self.root): + np_dtype = np.float32 + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem _targets = [] - with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: if len(word["text"]) > 0: x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - if rotated_bbox: - box = list(fit_rbbox(np.array([ - [x[0], y[0]], - [x[1], y[1]], - [x[2], y[2]], - [x[3], y[3]], - ], dtype=np.float32))) + box: Union[List[float], np.ndarray] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], + dtype=np_dtype, + ) else: - # Reduce 8 coords to 4 + # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax box = [min(x), min(y), max(x), max(y)] - _targets.append((word['text'], box)) + _targets.append((word["text"], box)) text_targets, box_targets = zip(*_targets) - self.data.append(( - img_path, - dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets) - )) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
@@ -397,8 +461,8 @@

Source code for doctr.datasets.cord

       
     
   
-
- + + diff --git a/v0.3.0/_modules/doctr/datasets/core.html b/v0.3.0/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.3.0/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.3.0/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.3.0/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset, _VisionDataset): - pass
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/datasets/detection.html b/v0.3.0/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.3.0/_modules/doctr/datasets/detection.html +++ b/v0.3.0/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/doc_artefacts.html b/v0.3.0/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.3.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.3.0/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.3.0/_modules/doctr/datasets/funsd.html b/v0.3.0/_modules/doctr/datasets/funsd.html index 35d7ad4cf5..f08612f9fa 100644 --- a/v0.3.0/_modules/doctr/datasets/funsd.html +++ b/v0.3.0/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.funsd

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['FUNSD']
+__all__ = ["FUNSD"]
 
 
 
-[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - Example:: - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 + :align: center + + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' - SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' - FILE_NAME = 'funsd.zip' + URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" + SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" + FILE_NAME = "funsd.zip" def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + super().__init__( + self.URL, + self.FILE_NAME, + self.SHA256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - self.sample_transforms = sample_transforms + np_dtype = np.float32 # Use the subset - subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') + subfolder = os.path.join("dataset", "training_data" if train else "testing_data") # # List images - self.root = os.path.join(self._root, subfolder, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + tmp_root = os.path.join(self.root, subfolder, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: data = json.load(f) - _targets = [(word['text'], word['box']) for block in data['form'] - for word in block['words'] if len(word['text']) > 0] + _targets = [ + (word["text"], word["box"]) + for block in data["form"] + for word in block["words"] + if len(word["text"]) > 0 + ] text_targets, box_targets = zip(*_targets) - if rotated_bbox: - # box_targets: xmin, ymin, xmax, ymax -> x, y, w, h, alpha = 0 - box_targets = [ + if use_polygons: + # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + box_targets = [ # type: ignore[assignment] [ - (box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0 - ] for box in box_targets + [box[0], box[1]], + [box[2], box[1]], + [box[2], box[3]], + [box[0], box[3]], + ] + for box in box_targets ] - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets))) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) + ) + for crop, label in zip(crops, list(text_targets)): + # filter labels with unknown characters + if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
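A similar sketch for the updated FUNSD constructor, including the new mutual-exclusion guard (the error message is the one raised in the code above, shortened here):

>>> from doctr.datasets import FUNSD
>>> train_set = FUNSD(train=True, download=True, use_polygons=True)  # targets become (N, 4, 2) polygons
>>> img, target = train_set[0]
>>> FUNSD(train=True, download=True, recognition_task=True, detection_task=True)
Traceback (most recent call last):
    ...
ValueError: `recognition_task` and `detection_task` cannot be set to True simultaneously. ...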
@@ -386,8 +453,8 @@

Source code for doctr.datasets.funsd

       
     
   
-
- + + diff --git a/v0.3.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.3.0/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.3.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.3.0/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/datasets/ic03.html b/v0.3.0/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.3.0/_modules/doctr/datasets/ic03.html +++ b/v0.3.0/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/ic13.html b/v0.3.0/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.3.0/_modules/doctr/datasets/ic13.html +++ b/v0.3.0/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/iiit5k.html b/v0.3.0/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.3.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.3.0/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/iiithws.html b/v0.3.0/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.3.0/_modules/doctr/datasets/iiithws.html +++ b/v0.3.0/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/imgur5k.html b/v0.3.0/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.3.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.3.0/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/loader.html b/v0.3.0/_modules/doctr/datasets/loader.html index d32e6da298..ed80350ef0 100644 --- a/v0.3.0/_modules/doctr/datasets/loader.html +++ b/v0.3.0/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.loader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-import tensorflow as tf
-import numpy as np
-from typing import Optional
+from typing import Callable, Optional
 
-from .multithreading import multithread_exec
+import numpy as np
+import tensorflow as tf
 
 __all__ = ["DataLoader"]
 
@@ -293,12 +314,13 @@ 

Source code for doctr.datasets.loader

     """Collate multiple elements into batches
 
     Args:
+    ----
         samples: list of N tuples containing M elements
 
     Returns:
+    -------
         Tuple of M sequences containing N elements each
     """
-
     batch_data = zip(*samples)
 
     tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
@@ -307,23 +329,23 @@ 

Source code for doctr.datasets.loader

 
 
 
-[docs] +[docs] class DataLoader: """Implements a dataset wrapper for fast data loading - Example:: - >>> from doctr.datasets import FUNSD, DataLoader - >>> train_set = CORD(train=True, download=True) - >>> train_loader = DataLoader(train_set, batch_size=32) - >>> train_iter = iter(train_loader) - >>> images, targets = next(train_iter) + >>> from doctr.datasets import CORD, DataLoader + >>> train_set = CORD(train=True, download=True) + >>> train_loader = DataLoader(train_set, batch_size=32) + >>> train_iter = iter(train_loader) + >>> images, targets = next(train_iter) Args: + ---- dataset: the dataset shuffle: whether the samples should be shuffled before passing it to the iterator batch_size: number of elements in each batch drop_last: if `True`, drops the last batch if it isn't full - workers: number of workers to use for data loading + collate_fn: function to merge samples into a batch """ def __init__( @@ -332,17 +354,22 @@

Source code for doctr.datasets.loader

         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        workers: Optional[int] = None,
+        collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
         self.batch_size = batch_size
         nb = len(self.dataset) / batch_size
         self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
-        self.workers = workers
+        if collate_fn is None:
+            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
+        else:
+            self.collate_fn = collate_fn
         self.reset()
 
+    def __len__(self) -> int:
+        return self.num_batches
+
     def reset(self) -> None:
         # Updates indices after each epoch
         self._num_yielded = 0
@@ -358,9 +385,9 @@ 

Source code for doctr.datasets.loader

         if self._num_yielded < self.num_batches:
             # Get next indices
             idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
+            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
 
-            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
+            samples = list(map(self.dataset.__getitem__, indices))
 
             batch_data = self.collate_fn(samples)
 
@@ -401,8 +428,8 @@ 

Source code for doctr.datasets.loader

       
     
   
-
- +
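Putting the loader changes above together (the `workers` argument is gone, `collate_fn` can now be overridden, and `__len__` reports the number of batches), a small usage sketch based on the docstring shown in this diff:

>>> from doctr.datasets import CORD, DataLoader
>>> train_set = CORD(train=True, download=True)
>>> train_loader = DataLoader(train_set, shuffle=True, batch_size=32, drop_last=False)
>>> len(train_loader)                            # number of batches, via the new __len__
>>> images, targets = next(iter(train_loader))   # images stacked by the dataset's collate_fn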
+ diff --git a/v0.3.0/_modules/doctr/datasets/mjsynth.html b/v0.3.0/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.3.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.3.0/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/ocr.html b/v0.3.0/_modules/doctr/datasets/ocr.html index 11297d5952..ce1ed8b0d4 100644 --- a/v0.3.0/_modules/doctr/datasets/ocr.html +++ b/v0.3.0/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.ocr

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple
 
-from .datasets import AbstractDataset
-from doctr.utils.geometry import fit_rbbox
+import numpy as np
 
+from .datasets import AbstractDataset
 
-__all__ = ['OCRDataset']
+__all__ = ["OCRDataset"]
 
 
 
-[docs] +[docs] class OCRDataset(AbstractDataset): """Implements an OCR dataset + >>> from doctr.datasets import OCRDataset + >>> train_set = OCRDataset(img_folder="/path/to/images", + >>> label_file="/path/to/labels.json") + >>> img, target = train_set[0] + Args: + ---- img_folder: local path to image folder (all jpg at the root) label_file: local path to the label file - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) - **kwargs: keyword arguments from `VisionDataset`. + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + **kwargs: keyword arguments from `AbstractDataset`. """ def __init__( self, img_folder: str, label_file: str, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, **kwargs: Any, ) -> None: - - self.sample_transforms = sample_transforms - self.root = img_folder + super().__init__(img_folder, **kwargs) # List images self.data: List[Tuple[str, Dict[str, Any]]] = [] - with open(label_file, 'rb') as f: + np_dtype = np.float32 + with open(label_file, "rb") as f: data = json.load(f) - for file_dic in data: + for img_name, annotations in data.items(): # Get image path - img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg' + img_name = Path(img_name) # File existence check if not os.path.exists(os.path.join(self.root, img_name)): raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") # handle empty images - if (len(file_dic["coordinates"]) == 0 or - (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")): - self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[]))) + if len(annotations["typed_words"]) == 0: + self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[]))) continue - is_valid: List[bool] = [] - box_targets: List[List[float]] = [] - for box in file_dic["coordinates"]: - if rotated_bbox: - x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32)) - box = [x, y, w, h, alpha] - is_valid.append(w > 0 and h > 0) - else: - xs, ys = zip(*box) - box = [min(xs), min(ys), max(xs), max(ys)] - is_valid.append(box[0] < box[2] and box[1] < box[3]) - if is_valid[-1]: - box_targets.append(box) + # Unpack the straight boxes (xmin, ymin, xmax, ymax) + geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + geoms = [ + [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]] # type: ignore[list-item] + for geom in geoms + ] + + text_targets = [obj["value"] for obj in annotations["typed_words"]] - text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid] - self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
+ self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
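The rewritten loader above expects the label file to map each image name to an entry with a `typed_words` list; a minimal illustration of that layout and of the call (paths and field values are placeholders):

>>> # labels.json, one entry per image (geometry read as xmin, ymin, xmax, ymax):
>>> # {"img_1.jpg": {"typed_words": [{"geometry": [15, 30, 120, 52], "value": "invoice"}]}}
>>> from doctr.datasets import OCRDataset
>>> train_set = OCRDataset(img_folder="/path/to/images", label_file="/path/to/labels.json")
>>> img, target = train_set[0]  # target: dict(boxes=(N, 4) float32 array, labels=[...])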
@@ -383,8 +402,8 @@

Source code for doctr.datasets.ocr

       
     
   
- - + + diff --git a/v0.3.0/_modules/doctr/datasets/recognition.html b/v0.3.0/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.3.0/_modules/doctr/datasets/recognition.html +++ b/v0.3.0/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/sroie.html b/v0.3.0/_modules/doctr/datasets/sroie.html index 66fd4ca3e0..04cf10bda2 100644 --- a/v0.3.0/_modules/doctr/datasets/sroie.html +++ b/v0.3.0/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.sroie

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import csv
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['SROIE']
+__all__ = ["SROIE"]
 
 
 
-[docs] +[docs] class SROIE(VisionDataset): """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction" <https://arxiv.org/pdf/2103.10213.pdf>`_. - Example:: - >>> from doctr.datasets import SROIE - >>> train_set = SROIE(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0 + :align: center + + >>> from doctr.datasets import SROIE + >>> train_set = SROIE(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip', - 'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip', - '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0", + "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f", + "sroie2019_train_task1.zip", + ) + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0", + "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2", + "sroie2019_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - self.sample_transforms = sample_transforms self.train = train - if rotated_bbox: - raise NotImplementedError + tmp_root = os.path.join(self.root, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + np_dtype = np.float32 - # # List images - self.root = os.path.join(self._root, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - _targets = [] - with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f: - for row in csv.reader(f, delimiter=','): - # Safeguard for blank lines - if len(row) > 0: - # Label may contain commas - label = ",".join(row[8:]) - # Reduce 8 coords to 4 - p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8]) - left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x) - top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y) - if len(label) > 0: - _targets.append((label, [left, top, right, bot])) - - text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets))) + with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f: + _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0] + + labels = [",".join(row[8:]) for row in _rows] + # reorder coordinates (8 -> (4,2) -> + # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines + coords: np.ndarray = np.stack( + [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0 + ) + + if not use_polygons: + # xmin, ymin, xmax, ymax + coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1) + + if recognition_task: + crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords) + for crop, label in zip(crops, labels): + if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0: + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, coords)) + else: + self.data.append((img_path, dict(boxes=coords, labels=labels))) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
@@ -390,8 +444,8 @@

Source code for doctr.datasets.sroie

       
     
   
-
- + + diff --git a/v0.3.0/_modules/doctr/datasets/svhn.html b/v0.3.0/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.3.0/_modules/doctr/datasets/svhn.html +++ b/v0.3.0/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/svt.html b/v0.3.0/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.3.0/_modules/doctr/datasets/svt.html +++ b/v0.3.0/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/synthtext.html b/v0.3.0/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.3.0/_modules/doctr/datasets/synthtext.html +++ b/v0.3.0/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.3.0/_modules/doctr/datasets/utils.html b/v0.3.0/_modules/doctr/datasets/utils.html index 2259698c0f..bde9304597 100644 --- a/v0.3.0/_modules/doctr/datasets/utils.html +++ b/v0.3.0/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.utils

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import string
 import unicodedata
+from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Sequence as SequenceType
+
 import numpy as np
-from typing import List, Optional, Any
+from PIL import Image
+
+from doctr.io.image import get_img_shape
+from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
 
 from .vocabs import VOCABS
 
-__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
+__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
+
+ImageTensor = TypeVar("ImageTensor")
 
 
 def translate(
     input_string: str,
     vocab_name: str,
-    unknown_char: str = '■',
+    unknown_char: str = "■",
 ) -> str:
     """Translate a string input in a given vocabulary
 
     Args:
+    ----
         input_string: input string to translate
         vocab_name: vocabulary to use (french, latin, ...)
         unknown_char: unknown character for non-translatable characters
 
     Returns:
-        A string translated in a given vocab"""
-
+    -------
+        A string translated in a given vocab
+    """
     if VOCABS.get(vocab_name) is None:
         raise KeyError("output vocabulary must be in vocabs dictionary")
 
-    translated = ''
+    translated = ""
     for char in input_string:
         if char not in VOCABS[vocab_name]:
             # we need to translate char into a vocab char
@@ -315,51 +350,63 @@ 

Source code for doctr.datasets.utils

                 # remove whitespaces
                 continue
             # normalize character if it is not in vocab
-            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
-            if char == '' or char not in VOCABS[vocab_name]:
+            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
+            if char == "" or char not in VOCABS[vocab_name]:
                 # if normalization fails or char still not in vocab, return unknown character
                 char = unknown_char
         translated += char
     return translated
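As an illustration of the behaviour implemented above (the vocabulary name follows the `(french, latin, ...)` hint in the docstring; the exact output depends on the shipped vocabs):

>>> from doctr.datasets.utils import translate
>>> # accented characters outside the target vocab are NFD-normalized,
>>> # anything still unknown is replaced by `unknown_char`
>>> translate("crème brûlée", "latin", unknown_char="■")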
 
 
-def encode_sequence(
+def encode_string(
     input_string: str,
     vocab: str,
 ) -> List[int]:
     """Given a predefined mapping, encode the string to a sequence of numbers
 
     Args:
+    ----
         input_string: string to encode
         vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A list encoding the input_string"""
-
-    return list(map(vocab.index, input_string))  # type: ignore[arg-type]
+    -------
+        A list encoding the input_string
+    """
+    try:
+        return list(map(vocab.index, input_string))
+    except ValueError:
+        raise ValueError(
+            f"some characters cannot be found in 'vocab'. \
+                         Please check the input string {input_string} and the vocabulary {vocab}"
+        )
 
 
 def decode_sequence(
-    input_array: np.array,
+    input_seq: Union[np.ndarray, SequenceType[int]],
     mapping: str,
 ) -> str:
     """Given a predefined mapping, decode the sequence of numbers to a string
 
     Args:
-        input_array: array to decode
+    ----
+        input_seq: array to decode
         mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A string, decoded from input_array"""
-
-    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
+    -------
+        A string, decoded from input_seq
+    """
+    if not isinstance(input_seq, (Sequence, np.ndarray)):
+        raise TypeError("Invalid sequence type")
+    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
         raise AssertionError("Input must be an array of int, with max less than mapping size")
-    decoded = ''.join(mapping[idx] for idx in input_array)
-    return decoded
+
+    return "".join(map(mapping.__getitem__, input_seq))
 
 
 
-[docs] +[docs] def encode_sequences( sequences: List[str], vocab: str, @@ -367,48 +414,53 @@

Source code for doctr.datasets.utils

     eos: int = -1,
     sos: Optional[int] = None,
     pad: Optional[int] = None,
-    **kwargs: Any,
+    dynamic_seq_length: bool = False,
 ) -> np.ndarray:
     """Encode character sequences using a given vocab as mapping
 
     Args:
+    ----
         sequences: the list of character sequences of size N
         vocab: the ordered vocab to use for encoding
         target_size: maximum length of the encoded data
         eos: encoding of End Of String
         sos: optional encoding of Start Of String
         pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
+        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
 
     Returns:
+    -------
         the padded encoded data as a tensor
     """
-
     if 0 <= eos < len(vocab):
         raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
 
-    if not isinstance(target_size, int):
-        target_size = max(len(w) for w in sequences)
-        if sos:
-            target_size += 1
-        if pad:
-            target_size += 1
+    if not isinstance(target_size, int) or dynamic_seq_length:
+        # Maximum string length + EOS
+        max_length = max(len(w) for w in sequences) + 1
+        if isinstance(sos, int):
+            max_length += 1
+        if isinstance(pad, int):
+            max_length += 1
+        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
 
     # Pad all sequences
-    if pad:  # pad with padding symbol
+    if isinstance(pad, int):  # pad with padding symbol
         if 0 <= pad < len(vocab):
             raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
         # In that case, add EOS at the end of the word before padding
-        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
+        default_symbol = pad
     else:  # pad with eos symbol
-        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
+        default_symbol = eos
+    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
 
-    for idx, seq in enumerate(sequences):
-        encoded_seq = encode_sequence(seq, vocab)
-        if pad:  # add eos at the end of the sequence
-            encoded_seq.append(eos)
-        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
+    # Encode the strings
+    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
+        if isinstance(pad, int):  # add eos at the end of the sequence
+            seq.append(eos)
+        encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)]
 
-    if sos:  # place eos symbol at the beginning of each sequence
+    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
         if 0 <= sos < len(vocab):
             raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
         encoded_data = np.roll(encoded_data, 1)
@@ -416,6 +468,59 @@ 

Source code for doctr.datasets.utils

 
     return encoded_data
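A small round-trip sketch of `encode_string`, `decode_sequence` and `encode_sequences` as rewritten above, using a toy vocabulary (the `eos`/`pad` indices simply have to fall outside the vocab range):

>>> from doctr.datasets.utils import encode_string, decode_sequence, encode_sequences
>>> vocab = "abc "
>>> seq = encode_string("a cab", vocab)   # [0, 3, 2, 0, 1]
>>> decode_sequence(seq, vocab)           # 'a cab'
>>> # batched encoding: every word is followed by EOS, then padded up to target_size
>>> encode_sequences(["ab", "c"], vocab, target_size=5, eos=4, pad=5)
array([[0, 1, 4, 5, 5],
       [2, 4, 5, 5, 5]], dtype=int32)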
+ + +def convert_target_to_relative( + img: ImageTensor, target: Union[np.ndarray, Dict[str, Any]] +) -> Tuple[ImageTensor, Union[Dict[str, Any], np.ndarray]]: + if isinstance(target, np.ndarray): + target = convert_to_relative_coords(target, get_img_shape(img)) + else: + target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img)) + return img, target + + +def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]: + """Crop a set of bounding boxes from an image + + Args: + ---- + img_path: path to the image + geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4) + + Returns: + ------- + a list of cropped images + """ + with Image.open(img_path) as pil_img: + img: np.ndarray = np.array(pil_img.convert("RGB")) + # Polygon + if geoms.ndim == 3 and geoms.shape[1:] == (4, 2): + return extract_rcrops(img, geoms.astype(dtype=int)) + if geoms.ndim == 2 and geoms.shape[1] == 4: + return extract_crops(img, geoms.astype(dtype=int)) + raise ValueError("Invalid geometry format") + + +def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]: + """Converts multiclass target to relative coordinates. + + Args: + ---- + img: Image + target: tuple of target polygons and their classes names + + Returns: + ------- + Image and dictionary of boxes, with class names as keys + """ + boxes = convert_to_relative_coords(target[0], get_img_shape(img)) + boxes_classes = target[1] + boxes_dict: Dict = {k: [] for k in sorted(set(boxes_classes))} + for k, poly in zip(boxes_classes, boxes): + boxes_dict[k].append(poly) + boxes_dict = {k: np.stack(v, axis=0) for k, v in boxes_dict.items()} + return img, boxes_dict
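The helper functions added above can also be used on their own; for instance, cutting word crops out of a page with `crop_bboxes_from_image` (the image path and box values below are placeholders):

>>> import numpy as np
>>> from doctr.datasets.utils import crop_bboxes_from_image
>>> # straight boxes: shape (N, 4) with absolute xmin, ymin, xmax, ymax
>>> boxes = np.array([[10, 20, 110, 60], [15, 80, 220, 120]])
>>> crops = crop_bboxes_from_image("/path/to/page.jpg", geoms=boxes)
>>> len(crops)  # one numpy crop per box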
@@ -448,8 +553,8 @@

Source code for doctr.datasets.utils

       
     
   
- - + + diff --git a/v0.3.0/_modules/doctr/datasets/wildreceipt.html b/v0.3.0/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.3.0/_modules/doctr/datasets/wildreceipt.html +++ b/v0.3.0/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.3.0/_modules/doctr/documents/elements.html b/v0.3.0/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.3.0/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
-
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/documents/reader.html b/v0.3.0/_modules/doctr/documents/reader.html deleted file mode 100644 index cdcd814b6c..0000000000 --- a/v0.3.0/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
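Each entry is a (bounding box, value) pair in absolute page coordinates, so a page of annotations can be unpacked as follows (path is a placeholder):

>>> from doctr.documents import DocumentFile
>>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
>>> (xmin, ymin, xmax, ymax), value = words[0][0]  # first word of the first page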
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
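A single path or byte stream is also accepted and is wrapped into a one-element list (path is a placeholder):

>>> from doctr.documents import DocumentFile
>>> pages = DocumentFile.from_images("path/to/your/page1.png")
>>> len(pages)  # 1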
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/io/elements.html b/v0.3.0/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.3.0/_modules/doctr/io/elements.html +++ b/v0.3.0/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.3.0/_modules/doctr/io/html.html b/v0.3.0/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.3.0/_modules/doctr/io/html.html +++ b/v0.3.0/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.3.0/_modules/doctr/io/image/base.html b/v0.3.0/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.3.0/_modules/doctr/io/image/base.html +++ b/v0.3.0/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.3.0/_modules/doctr/io/image/tensorflow.html b/v0.3.0/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.3.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.3.0/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.3.0/_modules/doctr/io/pdf.html b/v0.3.0/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.3.0/_modules/doctr/io/pdf.html +++ b/v0.3.0/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.3.0/_modules/doctr/io/reader.html b/v0.3.0/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.3.0/_modules/doctr/io/reader.html +++ b/v0.3.0/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.3.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.3.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.3.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.3.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.3.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.3.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.3.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/models/classification/zoo.html b/v0.3.0/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.3.0/_modules/doctr/models/classification/zoo.html +++ b/v0.3.0/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

- + diff --git a/v0.3.0/_modules/doctr/models/detection/differentiable_binarization.html b/v0.3.0/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index 38e9b36ec2..0000000000 --- a/v0.3.0/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to expand (unclip) the shrunk polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: the polygon, as an array of 2D vertices, to expand
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly cast to an ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
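# Editorial sketch (not part of the original module): the offset applied above grows with box area.
# For a 100 x 20 px word box with unclip_ratio = 1.5, each edge is pushed outwards by:
#   area * unclip_ratio / perimeter = (100 * 20) * 1.5 / (2 * (100 + 20)) = 12.5 px
# before cv2.boundingRect is taken on the expanded polygon.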
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing x, y, w, h, score for the box
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too-small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channels to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):  # iterate top-down so each coarser map is fused into the finer one
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature maps is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
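# Editorial sketch (not part of the original module): the formula above is the point-to-segment distance.
# For the grid point P = (1, 1) and the segment a = (0, 0), b = (4, 0):
#   square_dist_1 = 2, square_dist_2 = 10, square_dist = 16
#   cosin = (16 - 2 - 10) / (2 * sqrt(20)) ~ 0.447, square_sin ~ 0.8
#   result = sqrt(2 * 10 * 0.8 / 16) = 1.0, i.e. the expected perpendicular distance of 1 px.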
-
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon treshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon: array of coordinates defining the polygon boundary
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
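# Editorial sketch (not part of the original module) of the hard-negative mining used above,
# on a toy 1D tensor instead of masked feature maps:
import tensorflow as tf
bce = tf.constant([0.2, 0.9, 0.1, 0.8, 0.05])   # per-pixel BCE values inside the mask
is_pos = tf.constant([1., 0., 0., 0., 0.])      # seg_target: a single positive pixel
neg_count = tf.minimum(tf.reduce_sum(1. - is_pos), 3. * tf.reduce_sum(is_pos))  # at most 3 negatives per positive
neg_loss, _ = tf.nn.top_k(bce * (1. - is_pos), tf.cast(neg_count, tf.int32))    # keep only the hardest negatives
balanced = (tf.reduce_sum(bce * is_pos) + tf.reduce_sum(neg_loss)) / (tf.reduce_sum(is_pos) + neg_count + 1e-6)
# balanced ~ 0.5: positives always count, easy negatives are ignored.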
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.3.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 9145c7c3fd..66cef8663d 100644 --- a/v0.3.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import List, Tuple, Optional, Any, Dict
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
+
+from ...classification import mobilenet_v3_large
 from .base import DBPostProcessor, _DBNet
 
-__all__ = ['DBNet', 'db_resnet50']
+__all__ = ["DBNet", "db_resnet50", "db_mobilenet_v3_large"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
+    "db_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_resnet50-649fa22b.weights.h5&src=0",
+    },
+    "db_mobilenet_v3_large": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_mobilenet_v3_large-ee2e1dbe.weights.h5&src=0",
     },
 }
 
@@ -313,6 +348,7 @@ 

Source code for doctr.models.detection.differentiable_binarization.tensorflo <https://arxiv.org/pdf/1612.03144.pdf>`_. Args: + ---- channels: number of channel to output """ @@ -322,9 +358,9 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo ) -> None: super().__init__() self.channels = channels - self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest') - self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)] - self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)] + self.upsample = layers.UpSampling2D(size=(2, 2), interpolation="nearest") + self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer="he_normal") for _ in range(4)] + self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2**idx) for idx in range(4)] @staticmethod def build_upsampling( @@ -334,20 +370,21 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo """Module which performs a 3x3 convolution followed by up-sampling Args: + ---- channels: number of output channels dilation_factor (int): dilation factor to scale the convolution output before concatenation Returns: + ------- a keras.layers.Layer object, wrapping these operations in a sequential module """ - - _layers = conv_sequence(channels, 'relu', True, kernel_size=3) + _layers = conv_sequence(channels, "relu", True, kernel_size=3) if dilation_factor > 1: - _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest')) + _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation="nearest")) - module = keras.Sequential(_layers) + module = Sequential(_layers) return module @@ -359,7 +396,6 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo x: List[tf.Tensor], **kwargs: Any, ) -> tf.Tensor: - # Channel mapping results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)] # Upsample & sum @@ -371,200 +407,324 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo return layers.concatenate(results) -class DBNet(_DBNet, keras.Model, NestedObject): +class DBNet(_DBNet, Model, NestedObject): """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_. Args: + ---- feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to + bin_thresh: threshold for binarization + box_thresh: minimal objectness score to consider a box + assume_straight_pages: if True, fit straight bounding boxes only + exportable: onnx exportable returns only logits + cfg: the configuration dict of the model + class_names: list of class names """ - _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "fpn", "probability_head", "threshold_head", "postprocessor"] def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, - rotated_bbox: bool = False, + fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea + bin_thresh: float = 0.3, + box_thresh: float = 0.1, + assume_straight_pages: bool = True, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, + class_names: List[str] = [CLASS_NAME], ) -> None: - super().__init__() + self.class_names = class_names + num_classes: int = len(self.class_names) self.cfg = cfg self.feat_extractor = feature_extractor - self.rotated_bbox = rotated_bbox + self.exportable = exportable + self.assume_straight_pages = assume_straight_pages self.fpn = FeaturePyramidNetwork(channels=fpn_channels) # Initialize kernels _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape] output_shape = tuple(self.fpn(_inputs).shape) - self.probability_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] + self.probability_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + self.threshold_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + + self.postprocessor = DBPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh ) - self.threshold_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] - ) - - self.postprocessor = 
DBPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, thresh_map: tf.Tensor, - target: List[Dict[str, Any]] + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output Args: + ---- out_map: output feature map of the model of shape (N, H, W, C) thresh_map: threshold map of shape (N, H, W, C) target: list of dictionary where each dict has a `boxes` and a `flags` entry + gamma: modulating factor in the focal loss formula + alpha: balancing factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") - prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1])) - thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) + prob_map = tf.math.sigmoid(out_map) + thresh_map = tf.math.sigmoid(thresh_map) - seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + seg_target, seg_mask, thresh_target, thresh_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) + seg_mask = tf.cast(seg_mask, tf.float32) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) - # Compute balanced BCE loss for proba_map - bce_scale = 5. - bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask] - - neg_target = 1 - seg_target[seg_mask] - positive_count = tf.math.reduce_sum(seg_target[seg_mask]) - negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count]) - negative_loss = bce_loss * neg_target - negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32)) - sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss) - balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6) - - # Compute dice loss for approxbin_map - bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask]))) - - bce_min = tf.math.reduce_min(bce_loss) - weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1. 
- inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights) - union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8 - dice_loss = 1 - 2.0 * inter / union + # Focal loss + focal_scale = 10.0 + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + + # Convert logits to prob, compute gamma factor + p_t = (seg_target * prob_map) + ((1 - seg_target) * (1 - prob_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class or for approx binary_map + if len(self.class_names) > 1: + dice_map = tf.nn.softmax(out_map, axis=-1) + else: + # compute binary map instead + dice_map = 1.0 / (1.0 + tf.exp(-50 * (prob_map - thresh_map))) + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) # Compute l1 loss for thresh_map - l1_scale = 10. if tf.reduce_any(thresh_mask): - l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask])) + thresh_mask = tf.cast(thresh_mask, tf.float32) + l1_loss = tf.reduce_sum(tf.abs(thresh_map - thresh_target) * thresh_mask) / ( + tf.reduce_sum(thresh_mask) + eps + ) else: - l1_loss = tf.constant(0.) + l1_loss = tf.constant(0.0) - return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss + return l1_loss + focal_scale * focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) feat_concat = self.fpn(feat_maps, **kwargs) logits = self.probability_head(feat_concat, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: - # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + if target is None or return_preds: + # Post-process boxes (keep only text predictions) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: thresh_map = self.threshold_head(feat_concat, **kwargs) loss = self.compute_loss(logits, thresh_map, target) - out['loss'] = loss + out["loss"] = loss return out -def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: +def _db_resnet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['fpn_channels'] = 
kwargs.get('fpn_channels', _cfg['fpn_channels']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) # Feature extractor - resnet = tf.keras.applications.__dict__[_cfg['backbone']]( - include_top=False, - weights=None, - input_shape=_cfg['input_shape'], - pooling=None, + feat_extractor = IntermediateLayerGetter( + backbone_fn( + weights="imagenet" if pretrained_backbone else None, + include_top=False, + pooling=None, + input_shape=_cfg["input_shape"], + ), + fpn_layers, ) + # Build the model + model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + + # Load pretrained parameters + if pretrained: + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) + + return model + + +def _db_mobilenet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained + + # Patch the config + _cfg = deepcopy(default_cfgs[arch]) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = default_cfgs[arch].get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor feat_extractor = IntermediateLayerGetter( - resnet, - _cfg['fpn_layers'], + backbone_fn( + input_shape=_cfg["input_shape"], + include_top=False, + pretrained=pretrained_backbone, + ), + fpn_layers, ) - kwargs['fpn_channels'] = _cfg['fpn_channels'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] - # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model
-[docs] +[docs] def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture Returns: + ------- text detection architecture """ + return _db_resnet( + "db_resnet50", + pretrained, + ResNet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
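A brief sketch of inference with the refactored interface (an editorial addition; the structure of out["preds"] follows the call method above, one dict per input page keyed by class name):

>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor, return_preds=True)
>>> list(out["preds"][0].keys())  # one entry per class (a single default class unless class_names is overridden)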
+ + + +
+[docs] +def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: + """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" + <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. + + >>> import tensorflow as tf + >>> from doctr.models import db_mobilenet_v3_large + >>> model = db_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) - return _db_resnet('db_resnet50', pretrained, **kwargs)
+ Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture + + Returns: + ------- + text detection architecture + """ + return _db_mobilenet( + "db_mobilenet_v3_large", + pretrained, + mobilenet_v3_large, + ["inverted_2", "inverted_5", "inverted_11", "final_block"], + **kwargs, + )

@@ -598,8 +758,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo - - + + diff --git a/v0.3.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.3.0/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.3.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/models/detection/linknet.html b/v0.3.0/_modules/doctr/models/detection/linknet.html deleted file mode 100644 index 129cfdce8b..0000000000 --- a/v0.3.0/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: probability map output by the LinkNet model
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing x, y, w, h, score for the box
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
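# Editorial sketch (not part of the original module): how an absolute component box becomes relative.
# On a 1024 x 1024 bitmap, a connected component with cv2.boundingRect output (x, y, w, h) = (100, 200, 50, 10)
# yields xmin, ymin, xmax, ymax = 100/1024, 200/1024, 150/1024, 210/1024, all clipped to [0, 1].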
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs]
-def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet:
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import linknet
-        >>> model = linknet(pretrained=True)
-        >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on our text detection dataset
-
-    Returns:
-        text detection architecture
-    """
-
-    return _linknet('linknet', pretrained, **kwargs)
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.3.0/_modules/doctr/models/detection/linknet/tensorflow.html index cd4f446673..ce995f99d4 100644 --- a/v0.3.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.linknet.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.classification import resnet18, resnet34, resnet50
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.backbones import ResnetStage
-from doctr.models.utils import conv_sequence, load_pretrained_params
-from .base import LinkNetPostProcessor, _LinkNet
 
-__all__ = ['LinkNet', 'linknet16']
+from .base import LinkNetPostProcessor, _LinkNet
 
+__all__ = ["LinkNet", "linknet_resnet18", "linknet_resnet34", "linknet_resnet50"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet16': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'num_classes': 1,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': None,
+    "linknet_resnet18": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet18-615a82c5.weights.h5&src=0",
+    },
+    "linknet_resnet34": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet34-9d772be5.weights.h5&src=0",
+    },
+    "linknet_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet50-6bf6c8b5.weights.h5&src=0",
     },
 }
 
 
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
+def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential:
     """Creates a LinkNet decoder block"""
-
     return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
+        *conv_sequence(in_chan // 4, "relu", True, kernel_size=1, **kwargs),
         layers.Conv2DTranspose(
             filters=in_chan // 4,
             kernel_size=3,
-            strides=2,
+            strides=stride,
             padding="same",
             use_bias=False,
-            kernel_initializer='he_normal'
+            kernel_initializer="he_normal",
         ),
         layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
+        layers.Activation("relu"),
+        *conv_sequence(out_chan, "relu", True, kernel_size=1),
     ])
 
 
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module"""
+class LinkNetFPN(Model, NestedObject):
+    """LinkNet Decoder module"""
 
     def __init__(
         self,
+        out_chans: int,
+        in_shapes: List[Tuple[int, ...]],
     ) -> None:
-
         super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
+        self.out_chans = out_chans
+        strides = [2] * (len(in_shapes) - 1) + [1]
+        i_chans = [s[-1] for s in in_shapes[::-1]]
+        o_chans = i_chans[1:] + [out_chans]
+        self.decoders = [
+            decoder_block(in_chan, out_chan, s, input_shape=in_shape)
+            for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1])
+        ]
+
+    def call(self, x: List[tf.Tensor], **kwargs: Any) -> tf.Tensor:
+        out = 0
+        for decoder, fmap in zip(self.decoders, x[::-1]):
+            out = decoder(out + fmap, **kwargs)
+        return out
 
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(_LinkNet, keras.Model):
+    def extra_repr(self) -> str:
+        return f"out_chans={self.out_chans}"
+
+
+class LinkNet(_LinkNet, Model):
     """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
     <https://arxiv.org/pdf/1707.03718.pdf>`_.
 
     Args:
-        num_classes: number of channels for the output
+    ----
+        feature extractor: the backbone serving as feature extractor
+        fpn_channels: number of channels each extracted feature maps is mapped to
+        bin_thresh: threshold for binarization of the output feature map
+        box_thresh: minimal objectness score to consider a box
+        assume_straight_pages: if True, fit straight bounding boxes only
+        exportable: onnx exportable returns only logits
+        cfg: the configuration dict of the model
+        class_names: list of class names
     """
 
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
+    _children_names: List[str] = ["feat_extractor", "fpn", "classifier", "postprocessor"]
 
     def __init__(
         self,
-        num_classes: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        rotated_bbox: bool = False,
+        feat_extractor: IntermediateLayerGetter,
+        fpn_channels: int = 64,
+        bin_thresh: float = 0.1,
+        box_thresh: float = 0.1,
+        assume_straight_pages: bool = True,
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
+        class_names: List[str] = [CLASS_NAME],
     ) -> None:
         super().__init__(cfg=cfg)
 
-        self.rotated_bbox = rotated_bbox
+        self.class_names = class_names
+        num_classes: int = len(self.class_names)
 
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
+        self.exportable = exportable
+        self.assume_straight_pages = assume_straight_pages
+
+        self.feat_extractor = feat_extractor
 
-        self.fpn = LinkNetFPN()
+        self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape])
+        self.fpn.build(self.feat_extractor.output_shape)
 
         self.classifier = Sequential([
             layers.Conv2DTranspose(
@@ -393,154 +442,246 @@ 

Source code for doctr.models.detection.linknet.tensorflow

strides=2, padding="same", use_bias=False, - kernel_initializer='he_normal' + kernel_initializer="he_normal", + input_shape=self.fpn.decoders[-1].output_shape[1:], ), layers.BatchNormalization(), - layers.Activation('relu'), - *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), + layers.Activation("relu"), + *conv_sequence(32, "relu", True, kernel_size=3, strides=1), layers.Conv2DTranspose( filters=num_classes, kernel_size=2, strides=2, padding="same", - use_bias=False, - kernel_initializer='he_normal' + use_bias=True, + kernel_initializer="he_normal", ), ]) - self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) + self.postprocessor = LinkNetPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh + ) def compute_loss( self, out_map: tf.Tensor, - target: List[Dict[str, Any]], - focal_loss: bool = False, - alpha: float = .5, - gamma: float = 2., - edge_factor: float = 2., + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on <https://github.com/tensorflow/addons/>`_. Args: + ---- out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry - focal_loss: if True, use focal loss instead of BCE - edge_factor: boost factor for box edges (in case of BCE) + gamma: modulating factor in the focal loss formula alpha: balancing factor in the focal loss formula - gammma: modulating factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ - seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) - edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) + seg_target, seg_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - - # Get the cross_entropy for each entry - bce = tf.keras.losses.binary_crossentropy( - seg_target[seg_mask], - tf.squeeze(out_map, axis=[-1])[seg_mask], - from_logits=True) - - if focal_loss: - if gamma and gamma < 0: - raise ValueError("Value of gamma should be greater than or equal to zero.") - - # Convert logits to prob, compute gamma factor - pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) - p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) - modulating_factor = tf.pow((1.0 - p_t), gamma) - - # Compute alpha factor - alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) - - # compute the final loss - loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) - - else: - # Compute BCE loss with highlighted edges - loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), - bce - ) - loss = tf.reduce_mean(loss) - - return loss + seg_mask = tf.cast(seg_mask, tf.float32) + + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + proba_map = tf.sigmoid(out_map) + + # Focal loss + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + # Convert logits to prob, compute gamma factor + p_t = (seg_target * proba_map) + ((1 - seg_target) * (1 - proba_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * 
(1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class + dice_map = tf.nn.softmax(out_map, axis=-1) if len(self.class_names) > 1 else proba_map + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) + + return focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, - focal_loss: bool = True, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - - logits = self.stem(x) - logits = self.fpn(logits) - logits = self.classifier(logits) + feat_maps = self.feat_extractor(x, **kwargs) + logits = self.fpn(feat_maps, **kwargs) + logits = self.classifier(logits, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) + if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: + if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: - loss = self.compute_loss(logits, target, focal_loss) - out['loss'] = loss + loss = self.compute_loss(logits, target) + out["loss"] = loss return out -def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: +def _linknet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> LinkNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor + feat_extractor = IntermediateLayerGetter( + backbone_fn( + pretrained=pretrained_backbone, + include_top=False, + input_shape=_cfg["input_shape"], + ), + fpn_layers, + ) - kwargs['num_classes'] = _cfg['num_classes'] - kwargs['input_shape'] = _cfg['input_shape'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(cfg=_cfg, **kwargs) + model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + 
skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model -
-[docs] -def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
+[docs] +def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet18 + >>> model = linknet_resnet18(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture + + Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet18", + pretrained, + resnet18, + ["resnet_block_1", "resnet_block_3", "resnet_block_5", "resnet_block_7"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet34(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet16 - >>> model = linknet16(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet34 + >>> model = linknet_resnet34(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture Returns: + ------- text detection architecture """ + return _linknet( + "linknet_resnet34", + pretrained, + resnet34, + ["resnet_block_2", "resnet_block_6", "resnet_block_12", "resnet_block_15"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet50(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet50 + >>> model = linknet_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture - return _linknet('linknet16', pretrained, **kwargs)
+ Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet50", + pretrained, + resnet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
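A minimal usage sketch (assuming the linknet_resnet18 entry point and the out_map/preds output keys shown in this diff) of how the reworked dictionary output can be consumed in a single forward pass:

>>> import tensorflow as tf
>>> from doctr.models import linknet_resnet18
>>> model = linknet_resnet18(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> # return_model_output exposes the sigmoid probability map, return_preds also runs the post-processor
>>> out = model(input_tensor, return_model_output=True, return_preds=True)
>>> prob_map = out["out_map"]  # N x H x W x num_classes probability map
>>> preds = out["preds"]  # one {class_name: boxes} dict per input image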
@@ -574,8 +715,8 @@

Source code for doctr.models.detection.linknet.tensorflow

- +
+ diff --git a/v0.3.0/_modules/doctr/models/detection/zoo.html b/v0.3.0/_modules/doctr/models/detection/zoo.html index d3128b8d14..3651c4e2d3 100644 --- a/v0.3.0/_modules/doctr/models/detection/zoo.html +++ b/v0.3.0/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
 from doctr.file_utils import is_tf_available, is_torch_available
-from .core import DetectionPredictor
-from ..preprocessor import PreProcessor
-from .. import detection
 
+from .. import detection
+from ..detection.fast import reparameterize
+from ..preprocessor import PreProcessor
+from .predictor import DetectionPredictor
 
 __all__ = ["detection_predictor"]
 
+ARCHS: List[str]
+
 
 if is_tf_available():
-    ARCHS = ['db_resnet50', 'linknet16']
+    ARCHS = [
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
 elif is_torch_available():
-    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
+    ARCHS = [
+        "db_resnet34",
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
+
 
+def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
+        _model = detection.__dict__[arch](
+            pretrained=pretrained,
+            pretrained_backbone=kwargs.get("pretrained_backbone", True),
+            assume_straight_pages=assume_straight_pages,
+        )
+        # Reparameterize FAST models by default to lower inference latency and memory usage
+        if isinstance(_model, detection.FAST):
+            _model = reparameterize(_model)
+    else:
+        if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
+            raise ValueError(f"unknown architecture: {type(arch)}")
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+        _model = arch
+        _model.assume_straight_pages = assume_straight_pages
+        _model.postprocessor.assume_straight_pages = assume_straight_pages
 
-    # Detection
-    _model = detection.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 1)
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 2)
     predictor = DetectionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
-        _model
+        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
+        _model,
     )
     return predictor
 
 
 
-[docs] -def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: +[docs] +def detection_predictor( + arch: Any = "fast_base", + pretrained: bool = False, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + batch_size: int = 2, + **kwargs: Any, +) -> DetectionPredictor: """Text detection architecture. - Example:: - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_resnet50') + ---- + arch: name of the architecture or model itself to use (e.g. 'db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset + assume_straight_pages: If True, fit straight boxes to the page + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right + batch_size: number of samples the model processes in parallel + **kwargs: optional keyword arguments passed to the architecture Returns: + ------- Detection predictor """ - - return _predictor(arch, pretrained, **kwargs)
+ return _predictor( + arch=arch, + pretrained=pretrained, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + batch_size=batch_size, + **kwargs, + )
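Since arch now accepts a model instance as well as an architecture name, a custom or fine-tuned detection model can be wrapped directly. A minimal sketch, assuming the detection_predictor and linknet_resnet18 entry points shown in this diff:

>>> import numpy as np
>>> from doctr.models import detection_predictor, linknet_resnet18
>>> # pass a pre-built model instead of an architecture name
>>> model = linknet_resnet18(pretrained=True)
>>> predictor = detection_predictor(arch=model, assume_straight_pages=True)
>>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> out = predictor([input_page])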
@@ -367,8 +449,8 @@

Source code for doctr.models.detection.zoo

       
     
   
diff --git a/v0.3.0/_modules/doctr/models/export.html b/v0.3.0/_modules/doctr/models/export.html deleted file mode 100644 index f25a81aa21..0000000000 --- a/v0.3.0/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@ -doctr.models.export - docTR documentation

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/models/factory/hub.html b/v0.3.0/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.3.0/_modules/doctr/models/factory/hub.html +++ b/v0.3.0/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
diff --git a/v0.3.0/_modules/doctr/models/recognition/crnn.html b/v0.3.0/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.3.0/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ -doctr.models.recognition.crnn - docTR documentation

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs decoding of raw output with CTC and decoding of CTC predictions
-        with label_to_idx mapping dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.3.0/_modules/doctr/models/recognition/crnn/tensorflow.html index 41cc93dd23..bc64da9a1b 100644 --- a/v0.3.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.crnn.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import tensorflow as tf
 from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential, Model
-from typing import Tuple, Dict, Any, Optional, List
+from tensorflow.keras.models import Model, Sequential
+
+from doctr.datasets import VOCABS
 
-from ... import backbones
-from ...utils import load_pretrained_params
+from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
+__all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
+    "crnn_vgg16_bn": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["legacy_french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_vgg16_bn-9c188f45.weights.h5&src=0",
     },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
+    "crnn_mobilenet_v3_small": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_small-54850265.weights.h5&src=0",
+    },
+    "crnn_mobilenet_v3_large": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_large-c64045e5.weights.h5&src=0",
     },
 }
 
 
 class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
+    """Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
         ignore_case: if True, ignore case of letters
         ignore_accents: if True, ignore accents of letters
@@ -325,37 +353,57 @@ 

Source code for doctr.models.recognition.crnn.tensorflow

def __call__( self, - logits: tf.Tensor - ) -> List[Tuple[str, float]]: - """ - Performs decoding of raw output with CTC and decoding of CTC predictions + logits: tf.Tensor, + beam_width: int = 1, + top_paths: int = 1, + ) -> Union[List[Tuple[str, float]], List[Tuple[List[str], List[float]]]]: + """Performs decoding of raw output with CTC and decoding of CTC predictions with label_to_idx mapping dictionnary Args: + ---- logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1 + beam_width: An int scalar >= 0 (beam search beam width). + top_paths: An int scalar >= 0, <= beam_width (controls output size). Returns: + ------- A list of decoded words of length BATCH_SIZE + """ # Decode CTC _decoded, _log_prob = tf.nn.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), - tf.fill(logits.shape[0], logits.shape[1]), - beam_width=1, top_paths=1, + tf.fill(tf.shape(logits)[:1], tf.shape(logits)[1]), + beam_width=beam_width, + top_paths=top_paths, ) - out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab)) - probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) + + _decoded = tf.sparse.concat( + 1, + [tf.sparse.expand_dims(dec, axis=1) for dec in _decoded], + expand_nonconcat_dims=True, + ) # dim : batchsize x beamwidth x actual_max_len_predictions + out_idxs = tf.sparse.to_dense(_decoded, default_value=len(self.vocab)) # Map it to characters _decoded_strings_pred = tf.strings.reduce_join( inputs=tf.nn.embedding_lookup(tf.constant(self._embedding, dtype=tf.string), out_idxs), - axis=-1 + axis=-1, ) _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] - word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - + decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value="not valid")[ + :, :, 0 + ] # dim : batch_size x beam_width + + if top_paths == 1: + probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) # dim : batchsize + decoded_strings_pred = tf.squeeze(decoded_strings_pred, axis=1) + word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] + else: + probs = tf.math.exp(_log_prob) # dim : batchsize x beamwidth + word_values = [[word.decode() for word in words] for words in decoded_strings_pred.numpy().tolist()] return list(zip(word_values, probs.numpy().tolist())) @@ -364,19 +412,26 @@

Source code for doctr.models.recognition.crnn.tensorflow

Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of units in the LSTM layers + exportable: onnx exportable returns only logits + beam_width: beam width for beam search decoding + top_paths: number of top paths for beam search decoding cfg: configuration dictionary """ - _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "decoder", "postprocessor"] def __init__( self, - feature_extractor: tf.keras.Model, + feature_extractor: Model, vocab: str, rnn_units: int = 128, + exportable: bool = False, + beam_width: int = 1, + top_paths: int = 1, cfg: Optional[Dict[str, Any]] = None, ) -> None: # Initialize kernels @@ -386,19 +441,21 @@

Source code for doctr.models.recognition.crnn.tensorflow

self.vocab = vocab self.max_length = w self.cfg = cfg + self.exportable = exportable self.feat_extractor = feature_extractor - self.decoder = Sequential( - [ - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Dense(units=len(vocab) + 1) - ] - ) + self.decoder = Sequential([ + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Dense(units=len(vocab) + 1), + ]) self.decoder.build(input_shape=(None, w, h * c)) self.postprocessor = CTCPostProcessor(vocab=vocab) + self.beam_width = beam_width + self.top_paths = top_paths + def compute_loss( self, model_output: tf.Tensor, @@ -407,16 +464,17 @@

Source code for doctr.models.recognition.crnn.tensorflow

"""Compute CTC loss for the model. Args: - gt: the encoded tensor with gt labels + ---- model_output: predicted logits of the model - seq_len: lengths of each gt word inside the batch + target: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) batch_len = model_output.shape[0] - input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) + input_length = tf.fill((batch_len,), model_output.shape[1]) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -428,8 +486,12 @@

Source code for doctr.models.recognition.crnn.tensorflow

target: Optional[List[str]] = None, return_model_output: bool = False, return_preds: bool = False, + beam_width: int = 1, + top_paths: int = 1, **kwargs: Any, ) -> Dict[str, Any]: + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") features = self.feat_extractor(x, **kwargs) # B x H x W x C --> B x W x H x C @@ -437,91 +499,132 @@

Source code for doctr.models.recognition.crnn.tensorflow

w, h, c = transposed_feat.get_shape().as_list()[1:] # B x W x H x C --> B x W x H * C features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c)) - logits = self.decoder(features_seq, **kwargs) + logits = _bf16_to_float32(self.decoder(features_seq, **kwargs)) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = logits + return out + if return_model_output: out["out_map"] = logits if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(logits) + out["preds"] = self.postprocessor(logits, beam_width=beam_width, top_paths=top_paths) if target is not None: - out['loss'] = self.compute_loss(logits, target) + out["loss"] = self.compute_loss(logits, target) return out -def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: +def _crnn( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> CRNN: + pretrained_backbone = pretrained_backbone and not pretrained + + kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"]) - # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) + _cfg["vocab"] = kwargs["vocab"] + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] - # Feature extractor - feat_extractor = backbones.__dict__[_cfg['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + input_shape=_cfg["input_shape"], include_top=False, + pretrained=pretrained_backbone, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params(model, _cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]) return model
-[docs] +[docs] def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_vgg16_bn + >>> model = crnn_vgg16_bn(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_vgg16_bn", pretrained, vgg16_bn_r, **kwargs)
+ + + +
+[docs] +def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based + Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_small + >>> model = crnn_mobilenet_v3_small(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
+ Returns: + ------- + text recognition architecture + """ + return _crnn("crnn_mobilenet_v3_small", pretrained, mobilenet_v3_small_r, **kwargs)
-def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based +
+[docs] +def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_large + >>> model = crnn_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_mobilenet_v3_large", pretrained, mobilenet_v3_large_r, **kwargs)
- return _crnn('crnn_resnet31', pretrained, **kwargs)
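
At inference time, a hedged sketch of reading the recognizer output (assuming the default top_paths=1, where predictions are (word, confidence) pairs; beam_width is forwarded to the CTC post-processor):

import tensorflow as tf
from doctr.models import crnn_mobilenet_v3_small

model = crnn_mobilenet_v3_small(pretrained=True)
x = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
out = model(x, return_preds=True, beam_width=4, top_paths=1)
for word, confidence in out["preds"]:
    print(word, confidence)
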
@@ -554,8 +657,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

- +
+ diff --git a/v0.3.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.3.0/_modules/doctr/models/recognition/master/tensorflow.html index 2dc5a27717..aa6aa69325 100644 --- a/v0.3.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.master.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import tensorflow as tf
-from tensorflow.keras import layers, Sequential, Model
-from typing import Tuple, List, Dict, Any, Optional
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
 
-from ..core import RecognitionPostProcessor
-from ...backbones.resnet import ResnetStage
-from ...utils import conv_sequence, load_pretrained_params
-from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
-from ....datasets import VOCABS
-from .base import _MASTER, _MASTERPostProcessor
+import tensorflow as tf
+from tensorflow.keras import Model, layers
+
+from doctr.datasets import VOCABS
+from doctr.models.classification import magc_resnet31
+from doctr.models.modules.transformer import Decoder, PositionalEncoding
 
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from .base import _MASTER, _MASTERPostProcessor
 
-__all__ = ['MASTER', 'master', 'MASTERPostProcessor']
+__all__ = ["MASTER", "master"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'master': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'input_shape': (48, 160, 3),
-        'vocab': VOCABS['french'],
-        'url': None,
+    "master": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
     },
 }
 
 
-class MAGC(layers.Layer):
-
-    """Implements the Multi-Aspect Global Context Attention, as described in
-    <https://arxiv.org/pdf/1910.02562.pdf>`_.
-
-    Args:
-        inplanes: input channels
-        headers: number of headers to split channels
-        att_scale: if True, re-scale attention to counteract the variance distributions
-        **kwargs
-    """
-
-    def __init__(
-        self,
-        inplanes: int,
-        headers: int = 1,
-        att_scale: bool = False,
-        **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-
-        self.headers = headers  # h
-        self.inplanes = inplanes  # C
-        self.att_scale = att_scale
-
-        self.single_header_inplanes = int(inplanes / headers)  # C / h
-
-        self.conv_mask = tf.keras.layers.Conv2D(
-            filters=1,
-            kernel_size=1,
-            kernel_initializer=tf.initializers.he_normal()
-        )
-
-        self.transform = tf.keras.Sequential(
-            [
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-                tf.keras.layers.LayerNormalization([1, 2, 3]),
-                tf.keras.layers.ReLU(),
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-            ],
-            name='transform'
-        )
-
-    @tf.function
-    def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
-        b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
-
-        # B, H, W, C -->> B*h, H, W, C/h
-        x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes))
-        x = tf.transpose(x, perm=(0, 3, 1, 2, 4))
-        x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes))
-
-        # Compute shortcut
-        shortcut = x
-        # B*h, 1, H*W, C/h
-        shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes))
-        # B*h, 1, C/h, H*W
-        shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2])
-
-        # Compute context mask
-        # B*h, H, W, 1,
-        context_mask = self.conv_mask(x)
-        # B*h, 1, H*W, 1
-        context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
-        # scale variance
-        if self.att_scale and self.headers > 1:
-            context_mask = context_mask / tf.sqrt(self.single_header_inplanes)
-        # B*h, 1, H*W, 1
-        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
-
-        # Compute context
-        # B*h, 1, C/h, 1
-        context = tf.matmul(shortcut, context_mask)
-        context = tf.reshape(context, shape=(b, 1, c, 1))
-        # B, 1, 1, C
-        context = tf.transpose(context, perm=(0, 1, 3, 2))
-        # Set shape to resolve shape when calling this module in the Sequential MAGCResnet
-        batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1]
-        context.set_shape([batch, 1, 1, chan])
-        return context
-
-    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
-        # Context modeling: B, H, W, C  ->  B, 1, 1, C
-        context = self.context_modeling(inputs)
-        # Transform: B, 1, 1, C  ->  B, 1, 1, C
-        transformed = self.transform(context)
-        return inputs + transformed
-
-
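
A minimal single-header sketch of the global-context attention idea implemented by the removed MAGC layer (toy shapes, not the original multi-header implementation): a 1x1 convolution scores every spatial position, the softmax of those scores pools the feature map into a (N, 1, 1, C) context vector, and a small transform of that context is added back to the input.

import tensorflow as tf
from tensorflow.keras import layers

n, h, w, c = 2, 8, 32, 256
inputs = tf.random.normal((n, h, w, c))

conv_mask = layers.Conv2D(1, 1)                                   # scores each spatial position
transform = tf.keras.Sequential([
    layers.Conv2D(c, 1),
    layers.LayerNormalization(axis=[1, 2, 3]),
    layers.ReLU(),
    layers.Conv2D(c, 1),
])

scores = tf.reshape(conv_mask(inputs), (n, h * w, 1))
weights = tf.nn.softmax(scores, axis=1)                           # attention over all H*W positions
flat = tf.reshape(inputs, (n, h * w, c))
context = tf.reshape(tf.reduce_sum(flat * weights, axis=1), (n, 1, 1, c))
out = inputs + transform(context)                                 # residual output, (N, H, W, C)
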
-class MAGCResnet(Sequential):
-
-    """Implements the modified resnet with MAGC layers, as described in paper.
-
-    Args:
-        headers: number of header to split channels in MAGC layers
-        input_shape: shape of the model input (without batch dim)
-    """
-
-    def __init__(
-        self,
-        headers: int = 1,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
-    ) -> None:
-        _layers = [
-            # conv_1x
-            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
-            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_2x
-            ResnetStage(num_blocks=1, output_channels=256),
-            MAGC(inplanes=256, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_3x
-            ResnetStage(num_blocks=2, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 1), (2, 1)),
-            # conv_4x
-            ResnetStage(num_blocks=5, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            # conv_5x
-            ResnetStage(num_blocks=3, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-        ]
-        super().__init__(_layers)
-
-
 class MASTER(_MASTER, Model):
-
     """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
     Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
 
     Args:
+    ----
+        feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary, (without EOS, SOS, PAD)
         d_model: d parameter for the transformer decoder
-        headers: headers for the MAGC module
         dff: depth of the pointwise feed-forward layer
         num_heads: number of heads for the multi-head attention module
         num_layers: number of decoder layers to stack
         max_length: maximum length of character sequence handled by the model
-        input_size: size of the image inputs
+        dropout: dropout probability of the decoder
+        input_shape: size of the image inputs
+        exportable: onnx exportable returns only logits
+        cfg: dictionary containing information about the model
     """
 
     def __init__(
         self,
+        feature_extractor: Model,
         vocab: str,
         d_model: int = 512,
-        headers: int = 1,
         dff: int = 2048,
-        num_heads: int = 8,
+        num_heads: int = 8,  # number of heads in the transformer decoder
         num_layers: int = 3,
         max_length: int = 50,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
+        dropout: float = 0.2,
+        input_shape: Tuple[int, int, int] = (32, 128, 3),  # different from the paper
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
 
-        self.vocab = vocab
+        self.exportable = exportable
         self.max_length = max_length
+        self.d_model = d_model
+        self.vocab = vocab
         self.cfg = cfg
         self.vocab_size = len(vocab)
 
-        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
-        self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS
+        self.feat_extractor = feature_extractor
+        self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])
 
         self.decoder = Decoder(
             num_layers=num_layers,
-            d_model=d_model,
+            d_model=self.d_model,
             num_heads=num_heads,
+            vocab_size=self.vocab_size + 3,  # EOS, SOS, PAD
             dff=dff,
-            vocab_size=self.vocab_size,
-            maximum_position_encoding=max_length,
+            dropout=dropout,
+            maximum_position_encoding=self.max_length,
         )
-        self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
-        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
 
+        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
 
     @tf.function
-    def make_mask(self, target: tf.Tensor) -> tf.Tensor:
-        look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
-        target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
-        combined_mask = tf.maximum(target_padding_mask, look_ahead_mask)
-        return combined_mask
+    def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
+        # (N, 1, 1, max_length)
+        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
+        target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
+        target_length = target.shape[1]
+        # sub mask filled diagonal with 1 = see 0 = masked (max_length, max_length)
+        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
+        # source mask filled with ones (max_length, positional_encoded_seq_len)
+        source_mask = tf.ones((target_length, source.shape[1]))
+        # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
+        target_mask = tf.math.logical_and(
+            tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
+        )
+        return source_mask, target_mask
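
A minimal standalone sketch of the two masks combined above, with a toy target batch (pad index chosen arbitrarily): the padding mask hides PAD positions, the lower-triangular mask hides future tokens, and False means masked.

import tensorflow as tf

pad_idx = 12                                                                     # stands in for vocab_size + 2
target = tf.constant([[10, 3, 5, pad_idx], [10, 7, pad_idx, pad_idx]])           # (N, T)
pad_mask = tf.math.not_equal(target, pad_idx)[:, tf.newaxis, tf.newaxis, :]      # (N, 1, 1, T)
causal = tf.cast(tf.linalg.band_part(tf.ones((4, 4)), -1, 0), tf.bool)           # (T, T)
target_mask = tf.math.logical_and(causal, pad_mask)                              # (N, 1, T, T)
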
 
+    @staticmethod
     def compute_loss(
-        self,
         model_output: tf.Tensor,
         gt: tf.Tensor,
         seq_len: List[int],
@@ -512,11 +413,13 @@ 

Source code for doctr.models.recognition.master.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -532,7 +435,7 @@

Source code for doctr.models.recognition.master.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) @@ -547,94 +450,103 @@
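
A minimal sketch of the masked cross-entropy above with hypothetical shapes: per-timestep cross-entropy is zeroed out after the EOS position and averaged over the true sequence length.

import tensorflow as tf

num_classes, timesteps = 13, 6
logits = tf.random.normal((2, timesteps, num_classes))
gt = tf.random.uniform((2, timesteps), maxval=num_classes, dtype=tf.int32)
seq_len = tf.constant([4, 3])                                                    # true lengths, EOS included

cce = tf.nn.softmax_cross_entropy_with_logits(tf.one_hot(gt, num_classes), logits)   # (N, T)
mask_2d = tf.sequence_mask(seq_len, timesteps)                                   # True on valid timesteps
masked_loss = tf.where(mask_2d, cce, tf.zeros_like(cce))
ce_loss = tf.reduce_sum(masked_loss, axis=1) / tf.cast(seq_len, masked_loss.dtype)   # per-sample loss
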

Source code for doctr.models.recognition.master.tensorflow

"""Call function for training Args: + ---- x: images target: list of str labels return_model_output: if True, return logits return_preds: if True, decode logits + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A dictionnary containing eventually loss, logits and predictions. """ - # Encode - feature = self.feature_extractor(x, **kwargs) - b, h, w, c = (tf.shape(feature)[i] for i in range(4)) + feature = self.feat_extractor(x, **kwargs) + b, h, w, c = feature.get_shape() + # (N, H, W, C) --> (N, H * W, C) feature = tf.reshape(feature, shape=(b, h * w, c)) - encoded = feature + self.feature_pe[:, :h * w, :] + # add positional encoding to features + encoded = self.positional_encoding(feature, **kwargs) out: Dict[str, tf.Tensor] = {} + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") + if target is not None: # Compute target: tensor of gts and sequence lengths - gt, seq_len = self.compute_target(target) - - if kwargs.get('training', False): - if target is None: - raise AssertionError("In training mode, you need to pass a value to 'target'") - tgt_mask = self.make_mask(gt) + gt, seq_len = self.build_target(target) + # Compute decoder masks + source_mask, target_mask = self.make_source_and_target_mask(encoded, gt) # Compute logits - output = self.decoder(gt, encoded, tgt_mask, None, **kwargs) + output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) - else: - # When not training, we want to compute logits in with the decoder, although - # we have access to gts (we need gts to compute the loss, but not in the decoder) logits = self.decode(encoded, **kwargs) + logits = _bf16_to_float32(logits) + + if self.exportable: + out["logits"] = logits + return out + if target is not None: - out['loss'] = self.compute_loss(logits, gt, seq_len) + out["loss"] = self.compute_loss(logits, gt, seq_len) if return_model_output: - out['out_map'] = logits + out["out_map"] = logits if return_preds: - predictions = self.postprocessor(logits) - out['preds'] = predictions + out["preds"] = self.postprocessor(logits) return out + @tf.function def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor: """Decode function for prediction Args: + ---- encoded: encoded features + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A Tuple of tf.Tensor: predictions, logits """ - b = tf.shape(encoded)[0] - max_len = tf.constant(self.max_length, dtype=tf.int32) + b = encoded.shape[0] + start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32) # SOS padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32) # PAD - ys = tf.fill(dims=(b, max_len - 1), value=padding_symbol) + ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol) start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols - # max_len = len + 2 (sos + eos) + # Final dimension include EOS/SOS/PAD for i in range(self.max_length - 1): - ys_mask = self.make_mask(ys) - output = self.decoder(ys, encoded, ys_mask, None, **kwargs) + source_mask, target_mask = self.make_source_and_target_mask(encoded, ys) + output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) prob = tf.nn.softmax(logits, axis=-1) - next_word = tf.argmax(prob, axis=-1, output_type=ys.dtype) - # ys.shape = B, T - 
i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(max_len), indexing='ij') + next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype) + # update ys with the next token and ignore the first token (SOS) + i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij") indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1) - ys = tf.tensor_scatter_nd_update(ys, indices, next_word[:, i + 1]) + ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i]) - # final_logits of shape (N, max_length - 1, vocab_size + 1) (whithout sos) + # Shape (N, max_length, vocab_size + 1) return logits class MASTERPostProcessor(_MASTERPostProcessor): """Post processor for MASTER architectures + Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -649,51 +561,66 @@

Source code for doctr.models.recognition.master.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER: +def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"]) + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) - kwargs['vocab'] = _cfg['vocab'] + kwargs["vocab"] = _cfg["vocab"] + kwargs["input_shape"] = _cfg["input_shape"] # Build the model - model = MASTER(cfg=_cfg, **kwargs) + model = MASTER( + backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False), + cfg=_cfg, + **kwargs, + ) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model
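
A hedged sketch of the greedy decoding step performed by the post-processor, using a toy three-character vocabulary: the argmax character is taken at each timestep, the weakest per-character probability becomes the word confidence, and the string is cut at the first <eos>.

import tensorflow as tf

embedding = tf.constant(["a", "b", "c", "<eos>"], dtype=tf.string)     # toy vocab + EOS
logits = tf.random.normal((2, 5, 4))                                   # (N, T, len(vocab) + 1)

out_idxs = tf.math.argmax(logits, axis=2)
probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
confidence = tf.math.reduce_min(probs, axis=1)                         # weakest character wins
chars = tf.nn.embedding_lookup(embedding, tf.cast(out_idxs, tf.int32))
joined = tf.strings.reduce_join(chars, axis=-1)
words = tf.strings.split(joined, "<eos>")
words = tf.sparse.to_dense(words.to_sparse(), default_value="")[:, 0]  # text before first <eos>
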
-[docs] +[docs] def master(pretrained: bool = False, **kwargs: Any) -> MASTER: """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import master - >>> model = master(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + + >>> import tensorflow as tf + >>> from doctr.models import master + >>> model = master(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keywoard arguments passed to the MASTER architecture + Returns: + ------- text recognition architecture """ - - return _master('master', pretrained, **kwargs)
+ return _master("master", pretrained, magc_resnet31, **kwargs)
@@ -727,8 +654,8 @@

Source code for doctr.models.recognition.master.tensorflow

- +
+ diff --git a/v0.3.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.3.0/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.3.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/models/recognition/sar.html b/v0.3.0/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.3.0/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.sar - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H * W) -> (N, 1)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, 1)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + 1) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.3.0/_modules/doctr/models/recognition/sar/tensorflow.html index e514e4f0c4..4a591e6451 100644 --- a/v0.3.0/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.sar.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
+
 import tensorflow as tf
-from tensorflow.keras import Sequential, layers, Model
-from typing import Tuple, Dict, List, Any, Optional
+from tensorflow.keras import Model, Sequential, layers
 
-from ... import backbones
-from ...utils import load_pretrained_params
-from ..core import RecognitionModel, RecognitionPostProcessor
+from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
 
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
+from ...classification import resnet31
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from ..core import RecognitionModel, RecognitionPostProcessor
+
+__all__ = ["SAR", "sar_resnet31"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
+    "sar_resnet31": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/sar_resnet31-5a58806c.weights.h5&src=0",
     },
 }
 
 
+class SAREncoder(layers.Layer, NestedObject):
+    """Implements encoder module of the SAR model
+
+    Args:
+    ----
+        rnn_units: number of hidden rnn units
+        dropout_prob: dropout probability
+    """
+
+    def __init__(self, rnn_units: int, dropout_prob: float = 0.0) -> None:
+        super().__init__()
+        self.rnn = Sequential([
+            layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
+            layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
+        ])
+
+    def call(
+        self,
+        x: tf.Tensor,
+        **kwargs: Any,
+    ) -> tf.Tensor:
+        # (N, C)
+        return self.rnn(x, **kwargs)
+
+
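
A minimal sketch of how the holistic feature is produced (hypothetical shapes): the (N, H, W, C) feature map is max-pooled over its height, and the resulting width-long sequence is summarised by the two-layer LSTM encoder into a single (N, rnn_units) vector.

import tensorflow as tf
from tensorflow.keras import Sequential, layers

features = tf.random.normal((2, 4, 32, 512))            # (N, H, W, C) backbone output
pooled = tf.reduce_max(features, axis=1)                # (N, W, C) vertical max pooling
encoder = Sequential([
    layers.LSTM(512, return_sequences=True),
    layers.LSTM(512, return_sequences=False),
])
holistic = encoder(pooled)                              # (N, 512) summary of the word crop
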
 class AttentionModule(layers.Layer, NestedObject):
     """Implements attention module of the SAR model
 
     Args:
+    ----
         attention_units: number of hidden attention units
 
     """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
 
+    def __init__(self, attention_units: int) -> None:
         super().__init__()
         self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            3,
+            strides=1,
+            use_bias=True,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
+            1,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.flatten = layers.Flatten()
 
@@ -343,12 +395,12 @@ 

Source code for doctr.models.recognition.sar.tensorflow

hidden_state: tf.Tensor, **kwargs: Any, ) -> tf.Tensor: - [H, W] = features.get_shape().as_list()[1:3] - # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) - hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) # shape (N, H, W, vgg_units) -> (N, H, W, attention_units) features_projection = self.features_projector(features, **kwargs) + # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) + hidden_state = tf.expand_dims(tf.expand_dims(hidden_state, axis=1), axis=1) + hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) projection = tf.math.tanh(hidden_state_projection + features_projection) # shape (N, H, W, attention_units) -> (N, H, W, 1) attention = self.attention_projector(projection, **kwargs) @@ -358,23 +410,25 @@

Source code for doctr.models.recognition.sar.tensorflow

# shape (N, H * W) -> (N, H, W, 1) attention_map = tf.reshape(attention, [-1, H, W, 1]) glimpse = tf.math.multiply(features, attention_map) - # shape (N, H * W) -> (N, 1) - glimpse = tf.reduce_sum(glimpse, axis=[1, 2]) - return glimpse + # shape (N, H * W) -> (N, C) + return tf.reduce_sum(glimpse, axis=[1, 2]) class SARDecoder(layers.Layer, NestedObject): """Implements decoder module of the SAR model Args: + ---- rnn_units: number of hidden units in recurrent cells max_length: maximum length of a sequence vocab_size: number of classes in the model alphabet embedding_units: number of hidden embedding units attention_units: number of hidden attention units - num_decoder_layers: number of LSTM layers to stack + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability """ + def __init__( self, rnn_units: int, @@ -382,23 +436,22 @@
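
A hedged standalone sketch of the additive attention computed by AttentionModule, with toy shapes: the decoder hidden state and the feature map are projected to a common width, combined through tanh, scored by a 1x1 convolution, softmax-normalised over all spatial positions, and the weighted feature map is summed into a single glimpse vector.

import tensorflow as tf
from tensorflow.keras import layers

features = tf.random.normal((2, 4, 32, 512))                    # (N, H, W, C)
hidden_state = tf.random.normal((2, 512))                       # (N, rnn_units)

proj_hidden = layers.Conv2D(128, 1, use_bias=False)             # attention_units = 128
proj_features = layers.Conv2D(128, 3, padding="same")
score = layers.Conv2D(1, 1, use_bias=False)

h = proj_hidden(hidden_state[:, tf.newaxis, tf.newaxis, :])     # (N, 1, 1, attention_units)
attention = score(tf.math.tanh(h + proj_features(features)))    # (N, H, W, 1)
attention = tf.nn.softmax(tf.reshape(attention, (2, -1)))       # normalised over H * W
attention = tf.reshape(attention, (2, 4, 32, 1))
glimpse = tf.reduce_sum(features * attention, axis=[1, 2])      # (N, C)
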

Source code for doctr.models.recognition.sar.tensorflow

vocab_size: int, embedding_units: int, attention_units: int, - num_decoder_layers: int = 2, - input_shape: Optional[List[Tuple[Optional[int]]]] = None, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, ) -> None: - super().__init__() self.vocab_size = vocab_size - self.lstm_decoder = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] - ) - self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) - self.attention_module = AttentionModule(attention_units) - self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units)) self.max_length = max_length - # Initialize kernels - if input_shape is not None: - self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units))) + self.embed = layers.Dense(embedding_units, use_bias=False) + self.embed_tgt = layers.Embedding(embedding_units, self.vocab_size + 1) + + self.lstm_cells = layers.StackedRNNCells([ + layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells) + ]) + self.attention_module = AttentionModule(attention_units) + self.output_dense = layers.Dense(self.vocab_size + 1, use_bias=True) + self.dropout = layers.Dropout(dropout_prob) def call( self, @@ -407,40 +460,47 @@

Source code for doctr.models.recognition.sar.tensorflow

gt: Optional[tf.Tensor] = None, **kwargs: Any, ) -> tf.Tensor: - - # initialize states (each of shape (N, rnn_units)) - states = self.lstm_decoder.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=tf.float32 - ) - # run first step of lstm - # holistic: shape (N, rnn_units) - _, states = self.lstm_decoder(holistic, states, **kwargs) - # Initialize with the index of virtual START symbol (placed after <eos>) - symbol = tf.fill(features.shape[0], self.vocab_size + 1) - logits_list = [] - if kwargs.get('training') and gt is None: - raise ValueError('Need to provide labels during training for teacher forcing') - for t in range(self.max_length + 1): # keep 1 step for <eos> - # one-hot symbol with depth vocab_size + 1 - # embeded_symbol: shape (N, embedding_units) - embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs) - logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs) - glimpse = self.attention_module( - features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs, - ) - # logits: shape (N, rnn_units), glimpse: shape (N, 1) - logits = tf.concat([logits, glimpse], axis=-1) - # shape (N, rnn_units + 1) -> (N, vocab_size + 1) - logits = self.output_dense(logits, **kwargs) - # update symbol with predicted logits for t+1 step - if kwargs.get('training'): - symbol = gt[:, t] # type: ignore[index] + if gt is not None: + gt_embedding = self.embed_tgt(gt, **kwargs) + + logits_list: List[tf.Tensor] = [] + + for t in range(self.max_length + 1): # 32 + if t == 0: + # step to init the first states of the LSTMCell + states = self.lstm_cells.get_initial_state( + inputs=None, batch_size=features.shape[0], dtype=features.dtype + ) + prev_symbol = holistic + elif t == 1: + # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros + # (N, vocab_size + 1) --> (N, embedding_units) + prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1], dtype=features.dtype) + prev_symbol = self.embed(prev_symbol, **kwargs) else: - symbol = tf.argmax(logits, axis=-1) - logits_list.append(logits) - outputs = tf.stack(logits_list, axis=1) # shape (N, max_length + 1, vocab_size + 1) - - return outputs + if gt is not None and kwargs.get("training", False): + # (N, embedding_units) -2 because of <bos> and <eos> (same) + prev_symbol = self.embed(gt_embedding[:, t - 2], **kwargs) + else: + # -1 to start at timestep where prev_symbol was initialized + index = tf.argmax(logits_list[t - 1], axis=-1) + # update prev_symbol with ones at the index of the previous logit vector + prev_symbol = self.embed(self.embed_tgt(index, **kwargs), **kwargs) + + # (N, C), (N, C) take the last hidden state and cell state from current timestep + _, states = self.lstm_cells(prev_symbol, states, **kwargs) + # states = (hidden_state, cell_state) + hidden_state = states[0][0] + # (N, H, W, C), (N, C) --> (N, C) + glimpse = self.attention_module(features, hidden_state, **kwargs) + # (N, C), (N, C) --> (N, 2 * C) + logits = tf.concat([hidden_state, glimpse], axis=1) + logits = self.dropout(logits, **kwargs) + # (N, vocab_size + 1) + logits_list.append(self.output_dense(logits, **kwargs)) + + # (max_length + 1, N, vocab_size + 1) --> (N, max_length + 1, vocab_size + 1) + return tf.transpose(tf.stack(logits_list[1:]), (1, 0, 2)) class SAR(Model, RecognitionModel): @@ -448,17 +508,20 @@

Source code for doctr.models.recognition.sar.tensorflow

Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of hidden units in both encoder and decoder LSTM embedding_units: number of embedding units attention_units: number of hidden units in attention module max_length: maximum word length handled by the model - num_decoders: number of LSTM to stack in decoder layer - + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability for the encoder and decoder + exportable: onnx exportable returns only logits + cfg: dictionary containing information about the model """ - _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"] def __init__( self, @@ -468,36 +531,34 @@

Source code for doctr.models.recognition.sar.tensorflow

embedding_units: int = 512, attention_units: int = 512, max_length: int = 30, - num_decoders: int = 2, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: - super().__init__() self.vocab = vocab + self.exportable = exportable self.cfg = cfg - self.max_length = max_length + 1 # Add 1 timestep for EOS after the longest word self.feat_extractor = feature_extractor - self.encoder = Sequential( - [ - layers.LSTM(units=rnn_units, return_sequences=True), - layers.LSTM(units=rnn_units, return_sequences=False) - ] - ) - # Initialize the kernels (watch out for reduce_max) - self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:]) - + self.encoder = SAREncoder(rnn_units, dropout_prob) self.decoder = SARDecoder( - rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders, - input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape] + rnn_units, + self.max_length, + len(vocab), + embedding_units, + attention_units, + num_decoder_cells, + dropout_prob, ) self.postprocessor = SARPostProcessor(vocab=vocab) + @staticmethod def compute_loss( - self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: tf.Tensor, @@ -506,11 +567,13 @@

Source code for doctr.models.recognition.sar.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -525,7 +588,7 @@

Source code for doctr.models.recognition.sar.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len) masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) def call( @@ -536,16 +599,28 @@

Source code for doctr.models.recognition.sar.tensorflow

return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - features = self.feat_extractor(x, **kwargs) - pooled_features = tf.reduce_max(features, axis=1) # vertical max pooling + # vertical max pooling --> (N, C, W) + pooled_features = tf.reduce_max(features, axis=1) + # holistic (N, C) encoded = self.encoder(pooled_features, **kwargs) + if target is not None: - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) seq_len = tf.cast(seq_len, tf.int32) - decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training for teacher forcing") + + decoded_features = _bf16_to_float32( + self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + ) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = decoded_features + return out + if return_model_output: out["out_map"] = decoded_features @@ -554,7 +629,7 @@
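
A hedged usage sketch of the call contract above: labels are required whenever training=True (teacher forcing), while at inference the decoder runs greedily and predictions come back as (word, confidence) pairs.

import tensorflow as tf
from doctr.models import sar_resnet31

model = sar_resnet31(pretrained=False)
x = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)

train_out = model(x, target=["word"], training=True)    # dict with a "loss" entry
eval_out = model(x, return_preds=True)                  # dict with "preds": [(word, confidence)]
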

Source code for doctr.models.recognition.sar.tensorflow

out["preds"] = self.postprocessor(decoded_features) if target is not None: - out['loss'] = self.compute_loss(decoded_features, gt, seq_len) + out["loss"] = self.compute_loss(decoded_features, gt, seq_len) return out @@ -563,9 +638,8 @@

Source code for doctr.models.recognition.sar.tensorflow

"""Post processor for SAR architectures Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -580,95 +654,75 @@

Source code for doctr.models.recognition.sar.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: +def _sar( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> SAR: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units']) - _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units']) - _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length']) - _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) # Feature extractor - feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + pretrained=pretrained_backbone, + input_shape=_cfg["input_shape"], include_top=False, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - kwargs['embedding_units'] = _cfg['embedding_units'] - kwargs['attention_units'] = _cfg['attention_units'] - kwargs['max_length'] = _cfg['max_length'] - kwargs['num_decoders'] = _cfg['num_decoders'] + kwargs["vocab"] = _cfg["vocab"] # Build the model model = SAR(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model -
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - -
-[docs] +[docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import sar_resnet31 + >>> model = sar_resnet31(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the SAR architecture Returns: + ------- text recognition architecture """ - - return _sar('sar_resnet31', pretrained, **kwargs)
+ return _sar("sar_resnet31", pretrained, resnet31, **kwargs)
@@ -702,8 +756,8 @@

Source code for doctr.models.recognition.sar.tensorflow

- +
+ diff --git a/v0.3.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.3.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.3.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.3.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.3.0/_modules/doctr/models/recognition/zoo.html b/v0.3.0/_modules/doctr/models/recognition/zoo.html index bf0ae6af6e..f664304019 100644 --- a/v0.3.0/_modules/doctr/models/recognition/zoo.html +++ b/v0.3.0/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
-from doctr.file_utils import is_tf_available, is_torch_available
-from .core import RecognitionPredictor
-from ..preprocessor import PreProcessor
-from .. import recognition
+from doctr.file_utils import is_tf_available
+from doctr.models.preprocessor import PreProcessor
 
+from .. import recognition
+from .predictor import RecognitionPredictor
 
 __all__ = ["recognition_predictor"]
 
 
-if is_tf_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
-elif is_torch_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
+ARCHS: List[str] = [
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "sar_resnet31",
+    "master",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
+]
+
 
+def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+        _model = recognition.__dict__[arch](
+            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
+        )
+    else:
+        if not isinstance(
+            arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+        ):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+    kwargs.pop("pretrained_backbone", None)
 
-    _model = recognition.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 32)
-    predictor = RecognitionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
-        _model
-    )
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
+    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
+    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
 
     return predictor
 
 
 
-[docs] -def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor: +[docs] +def recognition_predictor( + arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + symmetric_pad: bool = False, + batch_size: int = 128, + **kwargs: Any, +) -> RecognitionPredictor: """Text recognition architecture. Example:: @@ -326,14 +369,18 @@

Source code for doctr.models.recognition.zoo

        >>> out = model([input_page])
 
     Args:
-        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
+    ----
+        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
+        **kwargs: optional parameters to be passed to the architecture
 
     Returns:
+    -------
         Recognition predictor
     """
-
-    return _predictor(arch, pretrained, **kwargs)
+ return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
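For illustration, a minimal usage sketch of the predictor built by the function above; the crop size, vocab and batch size are placeholders, and handing over a model instance relies on the `arch: Any` branch shown in `_predictor`:

>>> import numpy as np
>>> from doctr.models import recognition_predictor, sar_resnet31
>>> # build from an architecture name
>>> reco = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=64)
>>> # or hand over an already instantiated model (e.g. one carrying a custom vocab)
>>> reco = recognition_predictor(sar_resnet31(pretrained=False, vocab="0123456789"))
>>> crops = [(255 * np.random.rand(32, 128, 3)).astype(np.uint8)]
>>> reco(crops)  # list of (word, confidence) tuples, one per crop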
@@ -367,8 +414,8 @@

Source code for doctr.models.recognition.zoo

   
-
- +
+ diff --git a/v0.3.0/_modules/doctr/models/zoo.html b/v0.3.0/_modules/doctr/models/zoo.html index dec6857019..d459671648 100644 --- a/v0.3.0/_modules/doctr/models/zoo.html +++ b/v0.3.0/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from typing import Any
-from .core import OCRPredictor
+
 from .detection.zoo import detection_predictor
+from .kie_predictor import KIEPredictor
+from .predictor import OCRPredictor
 from .recognition.zoo import recognition_predictor
 
+__all__ = ["ocr_predictor", "kie_predictor"]
 
-__all__ = ["ocr_predictor"]
-
-
-def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
 
+def _predictor(
+    det_arch: Any,
+    reco_arch: Any,
+    pretrained: bool,
+    pretrained_backbone: bool = True,
+    assume_straight_pages: bool = True,
+    preserve_aspect_ratio: bool = True,
+    symmetric_pad: bool = True,
+    det_bs: int = 2,
+    reco_bs: int = 128,
+    detect_orientation: bool = False,
+    straighten_pages: bool = False,
+    detect_language: bool = False,
+    **kwargs,
+) -> OCRPredictor:
     # Detection
-    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
+    det_predictor = detection_predictor(
+        det_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=det_bs,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+    )
 
     # Recognition
-    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
+    reco_predictor = recognition_predictor(
+        reco_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=reco_bs,
+    )
 
-    return OCRPredictor(det_predictor, reco_predictor)
+    return OCRPredictor(
+        det_predictor,
+        reco_predictor,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+        detect_orientation=detect_orientation,
+        straighten_pages=straighten_pages,
+        detect_language=detect_language,
+        **kwargs,
+    )
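A rough usage sketch of the two public wrappers built on this helper, `ocr_predictor` and `kie_predictor` (defined below); the input page is random data, purely for illustration:

>>> import numpy as np
>>> from doctr.models import ocr_predictor, kie_predictor
>>> page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> # end-to-end OCR: detection + recognition over the page batch
>>> ocr = ocr_predictor("db_resnet50", "crnn_vgg16_bn", pretrained=True, straighten_pages=True)
>>> ocr_out = ocr([page])
>>> # KIE variant: same building blocks, predictions grouped by detection class
>>> kie = kie_predictor("db_resnet50", "crnn_vgg16_bn", pretrained=True)
>>> kie_out = kie([page])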
 
 
 
-[docs] +[docs] def ocr_predictor( - det_arch: str = 'db_resnet50', - reco_arch: str = 'crnn_vgg16_bn', + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", pretrained: bool = False, - **kwargs: Any + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - Example:: - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` Returns: + ------- OCR predictor """ + return _predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
+ + - return _predictor(det_arch, reco_arch, pretrained, **kwargs)
+def _kie_predictor( + det_arch: Any, + reco_arch: Any, + pretrained: bool, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + det_bs: int = 2, + reco_bs: int = 128, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs, +) -> KIEPredictor: + # Detection + det_predictor = detection_predictor( + det_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=det_bs, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + ) + + # Recognition + reco_predictor = recognition_predictor( + reco_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=reco_bs, + ) + + return KIEPredictor( + det_predictor, + reco_predictor, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + ) + + +
+[docs] +def kie_predictor( + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, +) -> KIEPredictor: + """End-to-end KIE architecture using one model for localization, and another for text recognition. + + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` + + Returns: + ------- + KIE predictor + """ + return _kie_predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
@@ -353,8 +575,8 @@

Source code for doctr.models.zoo

       
     
   
- - + + diff --git a/v0.3.0/_modules/doctr/transforms/modules.html b/v0.3.0/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.3.0/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - - - - - - - - - - - - doctr.transforms.modules - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/_modules/doctr/transforms/modules/base.html b/v0.3.0/_modules/doctr/transforms/modules/base.html index c42079a8fd..4596df3848 100644 --- a/v0.3.0/_modules/doctr/transforms/modules/base.html +++ b/v0.3.0/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.base

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+import math
 import random
-from typing import List, Any, Callable
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import numpy as np
 
 from doctr.utils.repr import NestedObject
+
 from .. import functional as F
 
+__all__ = ["SampleCompose", "ImageTransform", "ColorInversion", "OneOf", "RandomApply", "RandomRotate", "RandomCrop"]
+
+
+class SampleCompose(NestedObject):
+    """Implements a wrapper that will apply transformations sequentially on both image and target
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfo = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
+                >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4)))
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import torch
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfos = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
+                >>> out, out_boxes = transfos(torch.rand(8, 64, 64, 3), np.zeros((2, 4)))
+
+    Args:
+    ----
+        transforms: list of transformation modules
+    """
+
+    _children_names: List[str] = ["sample_transforms"]
+
+    def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None:
+        self.sample_transforms = transforms
+
+    def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]:
+        for t in self.sample_transforms:
+            x, target = t(x, target)
+
+        return x, target
+
+
+class ImageTransform(NestedObject):
+    """Implements a transform wrapper to turn an image-only transformation into an image+target transform
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion((32, 32)))
+                >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None)
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import torch
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion((32, 32)))
+                >>> out, _ = transfo(torch.rand(8, 64, 64, 3), None)
+
+    Args:
+    ----
+        transform: the image transformation module to wrap
+    """
+
+    _children_names: List[str] = ["img_transform"]
+
+    def __init__(self, transform: Callable[[Any], Any]) -> None:
+        self.img_transform = transform
 
-__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
+    def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]:
+        img = self.img_transform(img)
+        return img, target
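A short sketch tying these wrappers together with the geometry-aware transforms defined further down in this module (`RandomRotate`, `RandomCrop`); the polygon coordinates and parameter values are made up for illustration:

>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate, RandomCrop
>>> # one relative (4, 2) polygon per word, in [0, 1] coordinates
>>> polys = np.array([[[0.1, 0.1], [0.4, 0.1], [0.4, 0.3], [0.1, 0.3]]], dtype=np.float32)
>>> transfo = SampleCompose([
...     ImageTransform(ColorInversion(min_val=0.6)),  # image-only transform, target passed through untouched
...     RandomRotate(max_angle=10, expand=True),      # rotates the image and its polygons together
...     RandomCrop(scale=(0.5, 1.0)),                 # crops the image and clips the surviving boxes
... ])
>>> img, target = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), polys)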
 
 
 
-[docs] +[docs] class ColorInversion(NestedObject): """Applies the following tranformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(torch.rand(8, 64, 64, 3)) Args: + ---- min_val: range [min_val, 1] to colorize RGB pixels """ + def __init__(self, min_val: float = 0.5) -> None: self.min_val = min_val @@ -316,59 +437,178 @@

Source code for doctr.transforms.modules.base

-[docs] +[docs] class OneOf(NestedObject): """Randomly apply one of the input transformations - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transforms: list of transformations, one only will be picked """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: # Pick transformation transfo = self.transforms[int(random.random() * len(self.transforms))] # Apply - return transfo(img)
+ return transfo(img) if target is None else transfo(img, target) # type: ignore[call-arg]
-[docs] +[docs] class RandomApply(NestedObject): """Apply with a probability p the input transformation - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transform: transformation to apply p: probability to apply """ - def __init__(self, transform: Callable[[Any], Any], p: float = .5) -> None: + + def __init__(self, transform: Callable[[Any], Any], p: float = 0.5) -> None: self.transform = transform self.p = p def extra_repr(self) -> str: return f"transform={self.transform}, p={self.p}" - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: if random.random() < self.p: - return self.transform(img) - return img
+ return self.transform(img) if target is None else self.transform(img, target) # type: ignore[call-arg] + return img if target is None else (img, target)
+ + + +
+[docs] +class RandomRotate(NestedObject): + """Randomly rotate a tensor image and its boxes + + .. image:: https://doctr-static.mindee.com/models?id=v0.4.0/rotation_illustration.png&src=0 + :align: center + + Args: + ---- + max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in + [-max_angle, max_angle] + expand: whether the image should be padded before the rotation + """ + + def __init__(self, max_angle: float = 5.0, expand: bool = False) -> None: + self.max_angle = max_angle + self.expand = expand + + def extra_repr(self) -> str: + return f"max_angle={self.max_angle}, expand={self.expand}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + angle = random.uniform(-self.max_angle, self.max_angle) + r_img, r_polys = F.rotate_sample(img, target, angle, self.expand) + # Removes deleted boxes + is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2 + return r_img, r_polys[is_kept]
+ + + +
+[docs] +class RandomCrop(NestedObject): + """Randomly crop a tensor image and its boxes + + Args: + ---- + scale: tuple of floats, relative (min_area, max_area) of the crop + ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w + """ + + def __init__(self, scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: + self.scale = scale + self.ratio = ratio + + def extra_repr(self) -> str: + return f"scale={self.scale}, ratio={self.ratio}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + scale = random.uniform(self.scale[0], self.scale[1]) + ratio = random.uniform(self.ratio[0], self.ratio[1]) + + height, width = img.shape[:2] + + # Calculate crop size + crop_area = scale * width * height + aspect_ratio = ratio * (width / height) + crop_width = int(round(math.sqrt(crop_area * aspect_ratio))) + crop_height = int(round(math.sqrt(crop_area / aspect_ratio))) + + # Ensure crop size does not exceed image dimensions + crop_width = min(crop_width, width) + crop_height = min(crop_height, height) + + # Randomly select crop position + x = random.randint(0, width - crop_width) + y = random.randint(0, height - crop_height) + + # relative crop box + crop_box = (x / width, y / height, (x + crop_width) / width, (y + crop_height) / height) + if target.shape[1:] == (4, 2): + min_xy = np.min(target, axis=1) + max_xy = np.max(target, axis=1) + _target = np.concatenate((min_xy, max_xy), axis=1) + else: + _target = target + + # Crop image and targets + croped_img, crop_boxes = F.crop_detection(img, _target, crop_box) + # hard fallback if no box is kept + if crop_boxes.shape[0] == 0: + return img, target + # clip boxes + return croped_img, np.clip(crop_boxes, 0, 1)
@@ -402,8 +642,8 @@

Source code for doctr.transforms.modules.base

- - + + diff --git a/v0.3.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.3.0/_modules/doctr/transforms/modules/tensorflow.html index 1d192a876b..acbbe96225 100644 --- a/v0.3.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.3.0/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
 import tensorflow as tf
-from typing import List, Any, Tuple, Callable
 
 from doctr.utils.repr import NestedObject
 
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
-           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
+from ..functional.tensorflow import _gaussian_filter, random_shadow
+
+__all__ = [
+    "Compose",
+    "Resize",
+    "Normalize",
+    "LambdaTransformation",
+    "ToGray",
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomSaturation",
+    "RandomHue",
+    "RandomGamma",
+    "RandomJpegQuality",
+    "GaussianBlur",
+    "ChannelShuffle",
+    "GaussianNoise",
+    "RandomHorizontalFlip",
+    "RandomShadow",
+    "RandomResize",
+]
 
 
 
-[docs] +[docs] class Compose(NestedObject): """Implements a wrapper that will apply transformations sequentially - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Compose, Resize + >>> transfos = Compose([Resize((32, 32))]) + >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- transforms: list of transformation modules """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms @@ -319,26 +361,27 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class Resize(NestedObject): """Resizes a tensor to a target size - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Resize + >>> transfo = Resize((32, 32)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- output_size: expected output size method: interpolation method preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically """ + def __init__( self, - output_size: Tuple[int, int], - method: str = 'bilinear', + output_size: Union[int, Tuple[int, int]], + method: str = "bilinear", preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, ) -> None: @@ -346,6 +389,14 @@

Source code for doctr.transforms.modules.tensorflow

self.method = method self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad + self.antialias = True + + if isinstance(self.output_size, int): + self.wanted_size = (self.output_size, self.output_size) + elif isinstance(self.output_size, (tuple, list)): + self.wanted_size = self.output_size + else: + raise AssertionError("Output size should be either a list, a tuple or an int") def extra_repr(self) -> str: _repr = f"output_size={self.output_size}, method='{self.method}'" @@ -353,64 +404,106 @@

Source code for doctr.transforms.modules.tensorflow

_repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" return _repr - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) + def __call__( + self, + img: tf.Tensor, + target: Optional[np.ndarray] = None, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: + input_dtype = img.dtype + self.output_size = ( + (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size + ) + + img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias) + # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio + raw_shape = img.shape[:2] + if self.symmetric_pad: + half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0) if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
+ if isinstance(self.output_size, (tuple, list)): + # In that case we need to pad because we want to enforce both width and height + if not self.symmetric_pad: + half_pad = (0, 0) + elif self.output_size[0] == img.shape[0]: + half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2)) + # Pad image + img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size) + + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: + if self.symmetric_pad: + offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1] + + if self.preserve_aspect_ratio: + # Get absolute coords + if target.shape[1:] == (4,): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1] + target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0] + else: + target[:, [0, 2]] *= raw_shape[1] / img.shape[1] + target[:, [1, 3]] *= raw_shape[0] / img.shape[0] + elif target.shape[1:] == (4, 2): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1] + target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0] + else: + target[..., 0] *= raw_shape[1] / img.shape[1] + target[..., 1] *= raw_shape[0] / img.shape[0] + else: + raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") + + return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1) + + return tf.cast(img, dtype=input_dtype)
-[docs] +[docs] class Normalize(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Normalize + >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- mean: average value per channel std: standard deviation per channel """ + def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) + self.mean = tf.constant(mean) + self.std = tf.constant(std) def extra_repr(self) -> str: return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std + img -= tf.cast(self.mean, dtype=img.dtype) + img /= tf.cast(self.std, dtype=img.dtype) return img
-[docs] +[docs] class LambdaTransformation(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import LambdaTransformation + >>> transfo = LambdaTransformation(lambda x: x/ 255.) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- fn: the function to be applied to the input tensor """ + def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: self.fn = fn @@ -420,37 +513,42 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class ToGray(NestedObject): """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import ToGray + >>> transfo = ToGray() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) """ + + def __init__(self, num_output_channels: int = 1): + self.num_output_channels = num_output_channels + def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
+ img = tf.image.rgb_to_grayscale(img) + return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
-[docs] +[docs] class RandomBrightness(NestedObject): """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomBrightness + >>> transfo = RandomBrightness() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] p: probability to apply transformation """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -463,21 +561,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomContrast(NestedObject): """Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomContrast + >>> transfo = RandomContrast() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) """ - def __init__(self, delta: float = .3) -> None: + + def __init__(self, delta: float = 0.3) -> None: self.delta = delta def extra_repr(self) -> str: @@ -489,21 +588,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomSaturation(NestedObject): """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomSaturation + >>> transfo = RandomSaturation() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) """ - def __init__(self, delta: float = .5) -> None: + + def __init__(self, delta: float = 0.5) -> None: self.delta = delta def extra_repr(self) -> str: @@ -515,19 +615,20 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomHue(NestedObject): """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHue + >>> transfo = RandomHue() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -540,22 +641,23 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomGamma(NestedObject): """randomly performs gamma correction for a tensor (batch of images or image) - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomGamma + >>> transfo = RandomGamma() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- min_gamma: non-negative real number, lower bound for gamma param max_gamma: non-negative real number, upper bound for gamma min_gain: lower bound for constant multiplier max_gain: upper bound for constant multiplier """ + def __init__( self, min_gamma: float = 0.5, @@ -580,20 +682,21 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomJpegQuality(NestedObject): """Randomly adjust jpeg quality of a 3 dimensional RGB image - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomJpegQuality + >>> transfo = RandomJpegQuality() + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- min_quality: int between [0, 100] max_quality: int between [0, 100] """ + def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: self.min_quality = min_quality self.max_quality = max_quality @@ -602,10 +705,224 @@

Source code for doctr.transforms.modules.tensorflow

return f"min_quality={self.min_quality}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality + return tf.image.random_jpeg_quality(img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality)
+ + + +
+[docs] +class GaussianBlur(NestedObject): + """Randomly adjust jpeg quality of a 3 dimensional RGB image + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianBlur + >>> transfo = GaussianBlur(3, (.1, 5)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + kernel_shape: size of the blurring kernel + std: min and max value of the standard deviation + """ + + def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None: + self.kernel_shape = kernel_shape + self.std = std + + def extra_repr(self) -> str: + return f"kernel_shape={self.kernel_shape}, std={self.std}" + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.squeeze( + _gaussian_filter( + img[tf.newaxis, ...], + kernel_size=self.kernel_shape, + sigma=random.uniform(self.std[0], self.std[1]), + mode="REFLECT", + ), + axis=0, )
+ + +
+[docs] +class ChannelShuffle(NestedObject): + """Randomly shuffle channel order of a given image""" + + def __init__(self): + pass + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
+ + + +
+[docs] +class GaussianNoise(NestedObject): + """Adds Gaussian Noise to the input tensor + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianNoise + >>> transfo = GaussianNoise(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + mean : mean of the gaussian distribution + std : std of the gaussian distribution + """ + + def __init__(self, mean: float = 0.0, std: float = 1.0) -> None: + super().__init__() + self.std = std + self.mean = mean + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), dtype=tf.uint8 + ) + else: + return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype) + + def extra_repr(self) -> str: + return f"mean={self.mean}, std={self.std}"
+ + + +
+[docs] +class RandomHorizontalFlip(NestedObject): + """Adds random horizontal flip to the input tensor/np.ndarray + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHorizontalFlip + >>> transfo = RandomHorizontalFlip(p=0.5) + >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1) + >>> target = np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32) + >>> out = transfo(image, target) + + Args: + ---- + p : probability of Horizontal Flip + """ + + def __init__(self, p: float) -> None: + super().__init__() + self.p = p + + def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + _img = tf.image.flip_left_right(img) + _target = target.copy() + # Changing the relative bbox coordinates + if target.shape[1:] == (4,): + _target[:, ::2] = 1 - target[:, [2, 0]] + else: + _target[..., 0] = 1 - target[..., 0] + return _img, _target + return img, target
+ + + +
+[docs] +class RandomShadow(NestedObject): + """Adds random shade to the input image + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomShadow + >>> transfo = RandomShadow(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + opacity_range : minimum and maximum opacity of the shade + """ + + def __init__(self, opacity_range: Optional[Tuple[float, float]] = None) -> None: + super().__init__() + self.opacity_range = opacity_range if isinstance(opacity_range, tuple) else (0.2, 0.8) + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value( + tf.math.round(255 * random_shadow(tf.cast(x, dtype=tf.float32) / 255, self.opacity_range)), + 0, + 255, + ), + dtype=tf.uint8, + ) + else: + return tf.clip_by_value(random_shadow(x, self.opacity_range), 0, 1) + + def extra_repr(self) -> str: + return f"opacity_range={self.opacity_range}"
+ + + +
+[docs] +class RandomResize(NestedObject): + """Randomly resize the input image and align corresponding targets + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomResize + >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + scale_range: range of the resizing factor for width and height (independently) + preserve_aspect_ratio: whether to preserve the aspect ratio of the image, + given a float value, the aspect ratio will be preserved with this probability + symmetric_pad: whether to symmetrically pad the image, + given a float value, the symmetric padding will be applied with this probability + p: probability to apply the transformation + """ + + def __init__( + self, + scale_range: Tuple[float, float] = (0.3, 0.9), + preserve_aspect_ratio: Union[bool, float] = False, + symmetric_pad: Union[bool, float] = False, + p: float = 0.5, + ): + super().__init__() + self.scale_range = scale_range + self.preserve_aspect_ratio = preserve_aspect_ratio + self.symmetric_pad = symmetric_pad + self.p = p + self._resize = Resize + + def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + scale_h = random.uniform(*self.scale_range) + scale_w = random.uniform(*self.scale_range) + new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w)) + + _img, _target = self._resize( + new_size, + preserve_aspect_ratio=self.preserve_aspect_ratio + if isinstance(self.preserve_aspect_ratio, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + symmetric_pad=self.symmetric_pad + if isinstance(self.symmetric_pad, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + )(img, target) + + return _img, _target + return img, target + + def extra_repr(self) -> str: + return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}" # noqa: E501
+
@@ -638,8 +955,8 @@

Source code for doctr.transforms.modules.tensorflow

- +
+ diff --git a/v0.3.0/_modules/doctr/utils/metrics.html b/v0.3.0/_modules/doctr/utils/metrics.html index 460c64a385..8a37d5949a 100644 --- a/v0.3.0/_modules/doctr/utils/metrics.html +++ b/v0.3.0/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.metrics

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-import cv2
-from typing import List, Tuple, Dict, Optional
-from unidecode import unidecode
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
-from doctr.utils.geometry import rbbox_to_polygon
+from shapely.geometry import Polygon
 
-__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
-           'nms', 'LocalizationConfusion', 'OCRMetric']
+__all__ = [
+    "TextMatch",
+    "box_iou",
+    "polygon_iou",
+    "nms",
+    "LocalizationConfusion",
+    "OCRMetric",
+    "DetectionMetric",
+]
 
 
 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
-    """Perform string comparison with multiple levels of tolerance
+    """Performs string comparison with multiple levels of tolerance
 
     Args:
+    ----
         word1: a string
         word2: another string
 
     Returns:
+    -------
         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-            unidecode counterparts and their lower-case unidecode counterparts match
+            anyascii counterparts and their lower-case anyascii counterparts match
     """
-    raw_match = (word1 == word2)
-    caseless_match = (word1.lower() == word2.lower())
-    unidecode_match = (unidecode(word1) == unidecode(word2))
+    raw_match = word1 == word2
+    caseless_match = word1.lower() == word2.lower()
+    anyascii_match = anyascii(word1) == anyascii(word2)
 
     # Warning: the order is important here, otherwise the pair ("EUR", "€") cannot be matched
-    unicase_match = (unidecode(word1).lower() == unidecode(word2).lower())
+    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-    return raw_match, caseless_match, unidecode_match, unicase_match
+    return raw_match, caseless_match, anyascii_match, unicase_match
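As an illustrative aside (not part of the upstream change above), the four tolerance levels returned by ``string_match`` behave as follows, assuming the function is imported from ``doctr.utils.metrics`` and that ``anyascii`` transliterates "€" to "EUR" as the warning comment suggests:

>>> from doctr.utils.metrics import string_match
>>> string_match("Hello", "hello")  # raw, caseless, anyascii, unicase
(False, True, False, True)
>>> string_match("€", "EUR")  # only the transliterated comparisons match
(False, False, True, True)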
 
 
 
-[docs] +[docs] class TextMatch: - """Implements text match metric (word-level accuracy) for recognition task. + r"""Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \\forall X, Y \\in \\mathcal{W}^N, - TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) + \forall X, Y \in \mathcal{W}^N, + TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \\forall a, x \\in \\mathcal{W}, - f_a(x) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } x = a \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{W}` is the set of all possible character sequences, + \forall a, x \in \mathcal{W}, + f_a(x) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } x = a \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. - Example:: - >>> from doctr.utils import TextMatch - >>> metric = TextMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() """ def __init__(self) -> None: self.reset() +
+[docs] def update( self, gt: List[str], @@ -354,29 +386,32 @@

Source code for doctr.utils.metrics

         """Update the state of the metric with new predictions
 
         Args:
+        ----
             gt: list of ground-truth character sequences
-            pred: list of predicted character sequences"""
-
+            pred: list of predicted character sequences
+        """
         if len(gt) != len(pred):
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.unidecode += int(_unidecode)
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
-        self.total += len(gt)
+        self.total += len(gt)
+
-[docs] +[docs] def summary(self) -> Dict[str, float]: """Computes the aggregated metrics - Returns: - a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode - counterpart and its lower-case unidecode counterpart + Returns + ------- + a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii + counterpart and its lower-case anyascii counterpart """ if self.total == 0: raise AssertionError("you need to update the metric before getting the summary") @@ -384,7 +419,7 @@

Source code for doctr.utils.metrics

         return dict(
             raw=self.raw / self.total,
             caseless=self.caseless / self.total,
-            unidecode=self.unidecode / self.total,
+            anyascii=self.anyascii / self.total,
             unicase=self.unicase / self.total,
         )
@@ -392,23 +427,25 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.raw = 0
         self.caseless = 0
-        self.unidecode = 0
+        self.anyascii = 0
         self.unicase = 0
         self.total = 0
def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Compute the IoU between two sets of bounding boxes + """Computes the IoU between two sets of bounding boxes Args: + ---- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) + Returns: + ------- the IoU matrix of shape (N, M) """ - - iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) + iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) @@ -419,107 +456,54 @@

Source code for doctr.utils.metrics

         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
 
 
-def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
-    """Compute the IoA (intersection over area) between two sets of bounding boxes:
-    ioa(i, j) = inter(i, j) / area(i)
-
-    Args:
-        boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
-        boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
-    Returns:
-        the IoA matrix of shape (N, M)
-    """
-
-    ioa_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
-
-    if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
-        l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
-        l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
-
-        left = np.maximum(l1, l2.T)
-        top = np.maximum(t1, t2.T)
-        right = np.minimum(r1, r2.T)
-        bot = np.minimum(b1, b2.T)
-
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
-        area = (r1 - l1) * (b1 - t1)
-        ioa_mat = intersection / area
-
-    return ioa_mat
-
-
-def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
-    """Compute the IoU between two sets of boolean masks
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
+    """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
-        masks_1: boolean masks of shape (N, H, W)
-        masks_2: boolean masks of shape (M, H, W)
+    ----
+        polys_1: rotated bounding boxes of shape (N, 4, 2)
+        polys_2: rotated bounding boxes of shape (M, 4, 2)
 
     Returns:
+    -------
         the IoU matrix of shape (N, M)
     """
+    if polys_1.ndim != 3 or polys_2.ndim != 3:
+        raise AssertionError("expects boxes to be in format (N, 4, 2)")
 
-    if masks_1.shape[1:] != masks_2.shape[1:]:
-        raise AssertionError("both boolean masks should have the same spatial shape")
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
 
-    iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
 
-    if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
-        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
-        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
-        axes = tuple(range(2, masks_1.ndim + 1))
-        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
 
     return iou_mat
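Similarly for the Shapely-based ``polygon_iou`` (an illustrative sketch, not part of the diff): two axis-aligned unit squares offset by half their side overlap on an area of 0.25 for a union of 1.75, i.e. an IoU of roughly 0.143:

>>> import numpy as np
>>> square = np.array([[[0, 0], [1, 0], [1, 1], [0, 1]]], dtype=np.float32)
>>> iou = polygon_iou(square, square + 0.5)
>>> round(float(iou[0, 0]), 3)  # intersection 0.25 over union 1.75
0.143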
 
 
-def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Convert boxes to masks
-
-    Args:
-        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
-        shape: spatial shapes of the output masks
-
-    Returns:
-        the boolean masks of shape (N, H, W)
-    """
-
-    masks = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
-
-    if boxes.shape[0] > 0:
-        # Get absolute coordinates
-        if boxes.dtype != np.int:
-            abs_boxes = boxes.copy()
-            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
-            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
-            abs_boxes = abs_boxes.round().astype(np.int)
-        else:
-            abs_boxes = boxes
-            abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
-
-        # TODO: optimize slicing to improve vectorization
-        for idx, _box in enumerate(abs_boxes):
-            box = rbbox_to_polygon(_box)
-            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
-
-    return masks.astype(bool)
-
-
-def nms(boxes: np.ndarray, thresh: float = .5) -> List[int]:
+def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
     """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
 
     Args:
+    ----
         boxes: np array of straight boxes: (*, 5), (xmin, ymin, xmax, ymax, score)
         thresh: iou threshold to perform box suppression.
 
     Returns:
+    -------
         A list of box indexes to keep
     """
     x1 = boxes[:, 0]
@@ -551,66 +535,71 @@ 

Source code for doctr.utils.metrics

 
 
 
-[docs] +[docs] class LocalizationConfusion: - """Implements common confusion metrics and mean IoU for localization evaluation. + r"""Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ - Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) + \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ + Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ + Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ + meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \\forall y \\in \\mathcal{B}, - g_X(y) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, + \forall y \in \mathcal{B}, + g_X(y) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: + """Updates the metric + Args: + ---- + gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + """ if preds.shape[0] > 0: # Compute IoU - if self.rotated_bbox: - mask_gts = rbox_to_mask(gts, shape=self.mask_shape) - mask_preds = rbox_to_mask(preds, shape=self.mask_shape) - iou_mat = mask_iou(mask_gts, mask_preds) + if self.use_polygons: + iou_mat = polygon_iou(gts, preds) else: iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=1).sum()) + self.tot_iou += float(iou_mat.max(axis=0).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -618,17 +607,18 @@

Source code for doctr.utils.metrics

 
         # Update counts
         self.num_gts += gts.shape[0]
-        self.num_preds += preds.shape[0]
+        self.num_preds += preds.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: """Computes the aggregated metrics - Returns: + Returns + ------- a tuple with the recall, precision and meanIoU scores """ - # Recall recall = self.matches / self.num_gts if self.num_gts > 0 else None @@ -636,7 +626,7 @@

Source code for doctr.utils.metrics

         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -645,64 +635,65 @@

Source code for doctr.utils.metrics

         self.num_gts = 0
         self.num_preds = 0
         self.matches = 0
-        self.tot_iou = 0.
+ self.tot_iou = 0.0
-[docs] +[docs] class OCRMetric: - """Implements end-to-end OCR metric. + r"""Implements an end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, - \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ - Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) + \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, + \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ + Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, - h_{B,L}(b, l) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ - & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, - :math:`\\mathcal{L}` is the set of possible character sequences, + \forall (b, l) \in \mathcal{B} \times \mathcal{L}, + h_{B,L}(b, l) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - ['hello'], ['hello', 'world']) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> ['hello'], ['hello', 'world']) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update( self, gt_boxes: np.ndarray, @@ -710,50 +701,58 @@

Source code for doctr.utils.metrics

         gt_labels: List[str],
         pred_labels: List[str],
     ) -> None:
+        """Updates the metric
 
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 4, 2) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 4, 2) if they are rotated ones
+            gt_labels: a list of N string labels
+            pred_labels: a list of M string labels
+        """
         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
-            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
-                                 "and the predictions")
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
 
         # Compute IoU
         if pred_boxes.shape[0] > 0:
-            if self.rotated_bbox:
-                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
-                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
-                iou_mat = mask_iou(mask_gts, mask_preds)
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
-            self.tot_iou += float(iou_mat.max(axis=1).sum())
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
 
             # Assign pairs
             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.unidecode_matches += int(_unidecode)
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
 
         self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]: """Computes the aggregated metrics - Returns: - a tuple with the recall & precision for each string comparison flexibility and the mean IoU + Returns + ------- + a tuple with the recall & precision for each string comparison and the mean IoU """ - # Recall recall = dict( raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None, caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None, - unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None, + anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None, unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None, ) @@ -761,12 +760,12 @@

Source code for doctr.utils.metrics

         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-            unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
             unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -774,12 +773,136 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.num_gts = 0
         self.num_preds = 0
-        self.tot_iou = 0.
+        self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.unidecode_matches = 0
+        self.anyascii_matches = 0
         self.unicase_matches = 0
+ + +
+[docs] +class DetectionMetric: + r"""Implements an object detection metric. + + The aggregated metrics are computed as follows: + + .. math:: + \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, + \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ + Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + + with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and + :math:`y`, and the function :math:`h_{B, C}` defined as: + + .. math:: + \forall (b, c) \in \mathcal{B} \times \mathcal{C}, + h_{B,C}(b, c) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{C}` is the set of possible class indices, + :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. + + >>> import numpy as np + >>> from doctr.utils import DetectionMetric + >>> metric = DetectionMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) + >>> metric.summary() + + Args: + ---- + iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format + """ + + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: + self.iou_thresh = iou_thresh + self.use_polygons = use_polygons + self.reset() + +
+[docs] + def update( + self, + gt_boxes: np.ndarray, + pred_boxes: np.ndarray, + gt_labels: np.ndarray, + pred_labels: np.ndarray, + ) -> None: + """Updates the metric + + Args: + ---- + gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + gt_labels: an array of class indices of shape (N,) + pred_labels: an array of class indices of shape (M,) + """ + if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: + raise AssertionError( + "there should be the same number of boxes and string both for the ground truth and the predictions" + ) + + # Compute IoU + if pred_boxes.shape[0] > 0: + if self.use_polygons: + iou_mat = polygon_iou(gt_boxes, pred_boxes) + else: + iou_mat = box_iou(gt_boxes, pred_boxes) + + self.tot_iou += float(iou_mat.max(axis=0).sum()) + + # Assign pairs + gt_indices, pred_indices = linear_sum_assignment(-iou_mat) + is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh + # Category comparison + self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) + + self.num_gts += gt_boxes.shape[0] + self.num_preds += pred_boxes.shape[0]
+ + +
+[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Computes the aggregated metrics + + Returns + ------- + a tuple with the recall & precision for each class prediction and the mean IoU + """ + # Recall + recall = self.num_matches / self.num_gts if self.num_gts > 0 else None + + # Precision + precision = self.num_matches / self.num_preds if self.num_preds > 0 else None + + # mean IoU (overall detected boxes) + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None + + return recall, precision, mean_iou
+ + + def reset(self) -> None: + self.num_gts = 0 + self.num_preds = 0 + self.tot_iou = 0.0 + self.num_matches = 0
+
@@ -812,8 +935,8 @@

Source code for doctr.utils.metrics

       
     
   
- - + + diff --git a/v0.3.0/_modules/doctr/utils/visualization.html b/v0.3.0/_modules/doctr/utils/visualization.html index 8e7dcca811..c818be6d7b 100644 --- a/v0.3.0/_modules/doctr/utils/visualization.html +++ b/v0.3.0/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.visualization

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import colorsys
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import matplotlib.pyplot as plt
-from matplotlib.figure import Figure
+import cv2
 import matplotlib.patches as patches
-import mplcursors
-from PIL import ImageFont, ImageDraw, Image
+import matplotlib.pyplot as plt
 import numpy as np
-import cv2
-from typing import Tuple, List, Dict, Any, Union
+from matplotlib.figure import Figure
 
-from .common_types import BoundingBox, RotatedBbox
+from .common_types import BoundingBox, Polygon4P
 
-__all__ = ['visualize_page', 'synthetize_page']
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
-def create_rect_patch(
-    geometry: Union[BoundingBox, RotatedBbox],
-    label: str,
+def rect_patch(
+    geometry: BoundingBox,
     page_dimensions: Tuple[int, int],
-    color: Tuple[int, int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
     alpha: float = 0.3,
     linewidth: int = 2,
     fill: bool = True,
-) -> patches.Patch:
-    """Create a matplotlib patch (rectangle) bounding the element
+    preserve_aspect_ratio: bool = False,
+) -> patches.Rectangle:
+    """Create a matplotlib rectangular patch for the element
 
     Args:
+    ----
         geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
         label: label to display when hovered
-        page_dimensions: dimensions of the Page
         color: color to draw box
         alpha: opacity parameter to fill the boxes, 0 = transparent
         linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
 
     Returns:
+    -------
         a rectangular Patch
     """
+    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
     height, width = page_dimensions
-    if len(geometry) == 5:
-        x, y, w, h, a = geometry  # type: ignore[misc]
-        x, w = x * width, w * width
-        y, h = y * height, h * height
-        points = cv2.boxPoints(((x, y), (w, h), a))
-        return patches.Polygon(
-            points,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
-    else:
-        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
-        xmin, xmax = xmin * width, xmax * width
-        ymin, ymax = ymin * height, ymax * height
-        return patches.Rectangle(
-            (xmin, ymin),
-            xmax - xmin,
-            ymax - ymin,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
+    (xmin, ymin), (xmax, ymax) = geometry
+    # Switch to absolute coords
+    if preserve_aspect_ratio:
+        width = height = max(height, width)
+    xmin, w = xmin * width, (xmax - xmin) * width
+    ymin, h = ymin * height, (ymax - ymin) * height
+
+    return patches.Rectangle(
+        (xmin, ymin),
+        w,
+        h,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
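As a usage sketch for ``rect_patch`` (illustrative, not part of the diff; ``ax`` is assumed to be an existing matplotlib ``Axes``): with a page of height 800 and width 600, the relative box ((0.1, 0.2), (0.5, 0.6)) is scaled to an absolute rectangle anchored near (60, 160) with a width of about 240 and a height of about 320 pixels:

>>> patch = rect_patch(((0.1, 0.2), (0.5, 0.6)), page_dimensions=(800, 600), label="word")
>>> ax.add_patch(patch)  # the label is what gets displayed on hover in interactive mode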
+def polygon_patch(
+    geometry: np.ndarray,
+    page_dimensions: Tuple[int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    alpha: float = 0.3,
+    linewidth: int = 2,
+    fill: bool = True,
+    preserve_aspect_ratio: bool = False,
+) -> patches.Polygon:
+    """Create a matplotlib polygon patch for the element
+
+    Args:
+    ----
+        geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
+        label: label to display when hovered
+        color: color to draw box
+        alpha: opacity parameter to fill the boxes, 0 = transparent
+        linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
+
+    Returns:
+    -------
+        a polygon Patch
+    """
+    if not geometry.shape == (4, 2):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
+    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
+    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
+
+    return patches.Polygon(
+        geometry,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def create_obj_patch(
+    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
+    page_dimensions: Tuple[int, int],
+    **kwargs: Any,
+) -> patches.Patch:
+    """Create a matplotlib patch for the element
+
+    Args:
+    ----
+        geometry: bounding box (straight or rotated) of the element
+        page_dimensions: dimensions of the page in format (height, width)
+        **kwargs: keyword arguments for the patch
+
+    Returns:
+    -------
+        a matplotlib Patch
+    """
+    if isinstance(geometry, tuple):
+        if len(geometry) == 2:  # straight word BB (2 pts)
+            return rect_patch(geometry, page_dimensions, **kwargs)
+        elif len(geometry) == 4:  # rotated word BB (4 pts)
+            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
+    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
+        return polygon_patch(geometry, page_dimensions, **kwargs)
+    raise ValueError("invalid geometry format")
+
+
+def get_colors(num_colors: int) -> List[Tuple[float, float, float]]:
+    """Generate num_colors color for matplotlib
+
+    Args:
+    ----
+        num_colors: number of colors to generate
+
+    Returns:
+    -------
+        colors: list of generated colors
+    """
+    colors = []
+    for i in np.arange(0.0, 360.0, 360.0 / num_colors):
+        hue = i / 360.0
+        lightness = (50 + np.random.rand() * 10) / 100.0
+        saturation = (90 + np.random.rand() * 10) / 100.0
+        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+    return colors
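Illustrative behaviour of ``get_colors`` (not part of the diff): it returns one HLS-derived RGB tuple per requested color, which ``visualize_kie_page`` further down maps onto prediction keys:

>>> palette = get_colors(3)
>>> len(palette), len(palette[0])
(3, 3)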
 
 
 
-[docs] +[docs] def visualize_page( page: Dict[str, Any], image: np.ndarray, @@ -359,18 +472,18 @@

Source code for doctr.utils.visualization

 ) -> Figure:
     """Visualize a full page with predicted blocks, lines and words
 
-    Example::
-        >>> import numpy as np
-        >>> import matplotlib.pyplot as plt
-        >>> from doctr.utils.visualization import visualize_page
-        >>> from doctr.models import ocr_db_crnn
-        >>> model = ocr_db_crnn(pretrained=True)
-        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-        >>> out = model([[input_page]])
-        >>> visualize_page(out[0].pages[0].export(), input_page)
-        >>> plt.show()
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from doctr.utils.visualization import visualize_page
+    >>> from doctr.models import ocr_db_crnn
+    >>> model = ocr_db_crnn(pretrained=True)
+    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+    >>> out = model([[input_page]])
+    >>> visualize_page(out[0].pages[0].export(), input_page)
+    >>> plt.show()
 
     Args:
+    ----
         page: the exported Page of a Document
         image: np array of the page, needs to have the same shape than page['dimensions']
         words_only: whether only words should be displayed
@@ -378,6 +491,11 @@ 

Source code for doctr.utils.visualization

         scale: figsize of the largest windows side
         interactive: whether the plot should be interactive
         add_labels: for static plot, adds text labels on top of bounding box
+        **kwargs: keyword arguments for the polygon patch
+
+    Returns:
+    -------
+        the matplotlib figure
     """
     # Get proper scale and aspect ratio
     h, w = image.shape[:2]
@@ -386,128 +504,189 @@ 

Source code for doctr.utils.visualization

     # Display the image
     ax.imshow(image)
     # hide both axis
-    ax.axis('off')
+    ax.axis("off")
 
     if interactive:
         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
 
-    for block in page['blocks']:
+    for block in page["blocks"]:
         if not words_only:
-            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
+            rect = create_obj_patch(
+                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
+            )
             # add patch on figure
             ax.add_patch(rect)
             if interactive:
                 # add patch to cursor's artists
                 artists.append(rect)
 
-        for line in block['lines']:
+        for line in block["lines"]:
             if not words_only:
-                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
+                rect = create_obj_patch(
+                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
-            for word in line['words']:
-                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
-                                         page['dimensions'], (0, 0, 1), **kwargs)
+            for word in line["words"]:
+                rect = create_obj_patch(
+                    word["geometry"],
+                    page["dimensions"],
+                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
+                    color=(0, 0, 1),
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
                 elif add_labels:
-                    if len(word['geometry']) == 5:
+                    if len(word["geometry"]) == 5:
                         text_loc = (
-                            int(page['dimensions'][1] * (word['geometry'][0] - word['geometry'][2] / 2)),
-                            int(page['dimensions'][0] * (word['geometry'][1] - word['geometry'][3] / 2))
+                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
+                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
                         )
                     else:
                         text_loc = (
-                            int(page['dimensions'][1] * word['geometry'][0][0]),
-                            int(page['dimensions'][0] * word['geometry'][0][1])
+                            int(page["dimensions"][1] * word["geometry"][0][0]),
+                            int(page["dimensions"][0] * word["geometry"][0][1]),
+                        )
+
+                    if len(word["geometry"]) == 2:
+                        # We draw only if boxes are in straight format
+                        ax.text(
+                            *text_loc,
+                            word["value"],
+                            size=10,
+                            alpha=0.5,
+                            color=(0, 0, 1),
                         )
-                    ax.text(
-                        *text_loc,
-                        word['value'],
-                        size=10,
-                        alpha=0.5,
-                        color=(0, 0, 1),
-                    )
 
         if display_artefacts:
-            for artefact in block['artefacts']:
-                rect = create_rect_patch(
-                    artefact['geometry'],
-                    'artefact',
-                    page['dimensions'],
-                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
+            for artefact in block["artefacts"]:
+                rect = create_obj_patch(
+                    artefact["geometry"],
+                    page["dimensions"],
+                    label="artefact",
+                    color=(0.5, 0.5, 0.5),
                     linewidth=1,
-                    **kwargs
+                    **kwargs,
                 )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
-    fig.tight_layout(pad=0.)
+    fig.tight_layout(pad=0.0)
 
     return fig
-def synthetize_page( +def visualize_kie_page( page: Dict[str, Any], - draw_proba: bool = False, - font_size: int = 13, -) -> np.ndarray: - """Draw a the content of the element page (OCR response) on a blank page. + image: np.ndarray, + words_only: bool = False, + display_artefacts: bool = True, + scale: float = 10, + interactive: bool = True, + add_labels: bool = True, + **kwargs: Any, +) -> Figure: + """Visualize a full page with predicted blocks, lines and words + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from doctr.utils.visualization import visualize_page + >>> from doctr.models import ocr_db_crnn + >>> model = ocr_db_crnn(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([[input_page]]) + >>> visualize_kie_page(out[0].pages[0].export(), input_page) + >>> plt.show() Args: - page: exported Page object to represent - draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0 - font_size: size of the font, default font = 13 + ---- + page: the exported Page of a Document + image: np array of the page, needs to have the same shape than page['dimensions'] + words_only: whether only words should be displayed + display_artefacts: whether artefacts should be displayed + scale: figsize of the largest windows side + interactive: whether the plot should be interactive + add_labels: for static plot, adds text labels on top of bounding box + **kwargs: keyword arguments for the polygon patch - Return: - A np array (drawn page) + Returns: + ------- + the matplotlib figure """ - # Draw template - h, w = page["dimensions"] - response = 255 * np.ones((h, w, 3), dtype=np.int32) + # Get proper scale and aspect ratio + h, w = image.shape[:2] + size = (scale * w / h, scale) if h > w else (scale, h / w * scale) + fig, ax = plt.subplots(figsize=size) + # Display the image + ax.imshow(image) + # hide both axis + ax.axis("off") - # Draw each word - for block in page["blocks"]: - for line in block["lines"]: - for word in line["words"]: - # Get aboslute word geometry - (xmin, ymin), (xmax, ymax) = word["geometry"] - xmin, xmax = int(w * xmin), int(w * xmax) - ymin, ymax = int(h * ymin), int(h * ymax) - - # White drawing context adapted to font size, 0.75 factor to convert pts --> pix - h_box, w_box = ymax - ymin, xmax - xmin - h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75)) - img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255)) - d = ImageDraw.Draw(img) - - # Draw in black the value of the word - d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0)) - - # Resize back to box size - img = img.resize((w_box, h_box), Image.NEAREST) - - # Colorize if draw_proba - if draw_proba: - p = int(255 * word["confidence"]) - mask = np.where(np.array(img) == 0, 1, 0) - proba = np.array([255 - p, 0, p]) - color = mask * proba[np.newaxis, np.newaxis, :] - white_mask = 255 * (1 - mask) - img = color + white_mask - - # Write to response page - response[ymin:ymax, xmin:xmax, :] = np.array(img) - - return response + if interactive: + artists: List[patches.Patch] = [] # instantiate an empty list of patches (to be drawn on the page) + + colors = {k: color for color, k in zip(get_colors(len(page["predictions"])), page["predictions"])} + for key, value in page["predictions"].items(): + for prediction in value: + if not words_only: + rect = create_obj_patch( + prediction["geometry"], + page["dimensions"], + label=f"{key} \n {prediction['value']} (confidence: 
{prediction['confidence']:.2%}", + color=colors[key], + linewidth=1, + **kwargs, + ) + # add patch on figure + ax.add_patch(rect) + if interactive: + # add patch to cursor's artists + artists.append(rect) + + if interactive: + import mplcursors + + # Create mlp Cursor to hover patches in artists + mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label())) + fig.tight_layout(pad=0.0) + + return fig + + +def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None: + """Draw an array of relative straight boxes on an image + + Args: + ---- + boxes: array of relative boxes, of shape (*, 4) + image: np array, float32 or uint8 + color: color to use for bounding box edges + **kwargs: keyword arguments from `matplotlib.pyplot.plot` + """ + h, w = image.shape[:2] + # Convert boxes to absolute coords + _boxes = deepcopy(boxes) + _boxes[:, [0, 2]] *= w + _boxes[:, [1, 3]] *= h + _boxes = _boxes.astype(np.int32) + for box in _boxes.tolist(): + xmin, ymin, xmax, ymax = box + image = cv2.rectangle( + image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2 + ) + plt.imshow(image) + plt.plot(**kwargs)
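Finally, a small usage sketch for ``draw_boxes`` (illustrative, not part of the diff), assuming a NumPy image and relative box coordinates; the annotated image is displayed through matplotlib:

>>> import numpy as np
>>> image = np.zeros((100, 200, 3), dtype=np.uint8)
>>> boxes = np.array([[0.1, 0.2, 0.5, 0.8]])  # (xmin, ymin, xmax, ymax), relative
>>> draw_boxes(boxes, image)  # draws a rectangle from (20, 20) to (100, 80)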
@@ -540,8 +719,8 @@

Source code for doctr.utils.visualization

       
     
   
- - + + diff --git a/v0.3.0/_modules/index.html b/v0.3.0/_modules/index.html index e86abcd4d4..5793c44f20 100644 --- a/v0.3.0/_modules/index.html +++ b/v0.3.0/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -225,20 +225,42 @@ - - + + diff --git a/v0.3.0/_sources/changelog.rst.txt b/v0.3.0/_sources/changelog.rst.txt index 430097d6c8..35befe7b96 100644 --- a/v0.3.0/_sources/changelog.rst.txt +++ b/v0.3.0/_sources/changelog.rst.txt @@ -1,6 +1,54 @@ Changelog ========= +v0.10.0 (2024-10-21) +------------------- +Release note: `v0.10.0 `_ + +v0.9.0 (2024-08-08) +------------------- +Release note: `v0.9.0 `_ + +v0.8.1 (2024-03-04) +------------------- +Release note: `v0.8.1 `_ + +v0.8.0 (2024-02-28) +------------------- +Release note: `v0.8.0 `_ + +v0.7.0 (2023-09-09) +------------------- +Release note: `v0.7.0 `_ + +v0.6.0 (2022-09-29) +------------------- +Release note: `v0.6.0 `_ + +v0.5.1 (2022-03-22) +------------------- +Release note: `v0.5.1 `_ + +v0.5.0 (2021-12-31) +------------------- +Release note: `v0.5.0 `_ + +v0.4.1 (2021-11-22) +------------------- +Release note: `v0.4.1 `_ + +v0.4.0 (2021-10-01) +------------------- +Release note: `v0.4.0 `_ + +v0.3.1 (2021-08-27) +------------------- +Release note: `v0.3.1 `_ + +v0.3.0 (2021-07-02) +------------------- +Release note: `v0.3.0 `_ + v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.3.0/_sources/datasets.rst.txt b/v0.3.0/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.3.0/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.3.0/_sources/documents.rst.txt b/v0.3.0/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.3.0/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. 
currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.3.0/_sources/getting_started/installing.rst.txt b/v0.3.0/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.3.0/_sources/getting_started/installing.rst.txt +++ b/v0.3.0/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.3.0/_sources/index.rst.txt b/v0.3.0/_sources/index.rst.txt index fc3ff89fdf..53251db142 100644 --- a/v0.3.0/_sources/index.rst.txt +++ b/v0.3.0/_sources/index.rst.txt @@ -1,7 +1,8 @@ -DocTR: Document Text Recognition -================================ +******************************** +docTR: Document Text Recognition +******************************** -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -9,38 +10,29 @@ State-of-the-art Optical Character Recognition made seamless & accessible to any DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. +* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. -Welcome to the documentation of `DocTR `_! 
- - Main Features ------------- * |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract +* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, small dependencies -* |:tools:| Daily maintained -* |:factory:| Easy integration - +* |:bird:| Light package, minimal dependencies +* |:tools:| Actively maintained by Mindee +* |:factory:| Easy integration (available templates for browser demo & API deployment) -Getting Started ---------------- .. toctree:: :maxdepth: 2 + :caption: Getting started + :hidden: - installing - - -Build & train your predictor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) -* Fine-tune or train from scratch any detection or recognition model to specialize on your data + getting_started/installing + notebooks Model zoo @@ -48,36 +40,83 @@ Model zoo Text detection models """"""""""""""""""""" - * `DBNet `_ (Differentiable Binarization) - * `LinkNet `_ +* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ +* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ +* FAST from `"FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation" `_ Text recognition models """"""""""""""""""""""" - * `SAR `_ (Show, Attend and Read) - * `CRNN `_ (Convolutional Recurrent Neural Network) - * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) +* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ +* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ +* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ +* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ +* PARSeq from `"Scene Text Recognition with Permuted Autoregressive Sequence Models" `_ Supported datasets ^^^^^^^^^^^^^^^^^^ - * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. - * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. - * SROIE from `ICDAR 2019 `_. +* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. +* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +* SROIE from `ICDAR 2019 `_. +* IIIT-5k from `CVIT `_. +* Street View Text from `"End-to-End Scene Text Recognition" `_. +* SynthText from `Visual Geometry Group `_. +* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. +* IC03 from `ICDAR 2003 `_. +* IC13 from `ICDAR 2013 `_. +* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. +* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. +* IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. 
toctree:: :maxdepth: 2 - :caption: Notes + :caption: Using docTR + :hidden: - changelog + using_doctr/using_models + using_doctr/using_datasets + using_doctr/using_contrib_modules + using_doctr/sharing_models + using_doctr/using_model_export + using_doctr/custom_models_training + using_doctr/running_on_aws + + +.. toctree:: + :maxdepth: 2 + :caption: Community + :hidden: + + community/resources .. toctree:: :maxdepth: 2 :caption: Package Reference + :hidden: - datasets - documents - models - transforms - utils + modules/contrib + modules/datasets + modules/io + modules/models + modules/transforms + modules/utils + + +.. toctree:: + :maxdepth: 2 + :caption: Contributing + :hidden: + + contributing/code_of_conduct + contributing/contributing + + +.. toctree:: + :maxdepth: 2 + :caption: Notes + :hidden: + + changelog diff --git a/v0.3.0/_sources/installing.rst.txt b/v0.3.0/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.3.0/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.3.0/_sources/models.rst.txt b/v0.3.0/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.3.0/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. 
- - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. 
-We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.3.0/_sources/transforms.rst.txt b/v0.3.0/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.3.0/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. 
autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.3.0/_sources/utils.rst.txt b/v0.3.0/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.3.0/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.3.0/_static/basic.css b/v0.3.0/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.3.0/_static/basic.css +++ b/v0.3.0/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.3.0/_static/doctools.js b/v0.3.0/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.3.0/_static/doctools.js +++ b/v0.3.0/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.3.0/_static/documentation_options.js b/v0.3.0/_static/documentation_options.js index a7b5cbe04a..4f656fdbea 100644 --- a/v0.3.0/_static/documentation_options.js +++ b/v0.3.0/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.3.0a0-git', + VERSION: '0.10.1a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.3.0/_static/language_data.js b/v0.3.0/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.3.0/_static/language_data.js +++ b/v0.3.0/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.3.0/_static/searchtools.js b/v0.3.0/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.3.0/_static/searchtools.js +++ b/v0.3.0/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.3.0/changelog.html b/v0.3.0/changelog.html index eafac3a877..fc45a50384 100644 --- a/v0.3.0/changelog.html +++ b/v0.3.0/changelog.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + Changelog - docTR documentation @@ -226,20 +226,42 @@ + diff --git a/v0.3.0/community/resources.html b/v0.3.0/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.3.0/community/resources.html +++ b/v0.3.0/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.3.0/contributing/code_of_conduct.html b/v0.3.0/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.3.0/contributing/code_of_conduct.html +++ b/v0.3.0/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.3.0/contributing/contributing.html b/v0.3.0/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.3.0/contributing/contributing.html +++ b/v0.3.0/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.3.0/datasets.html b/v0.3.0/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.3.0/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework -can save a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before passing it to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-
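For illustration, here is a minimal sketch of a full pass over a dataset with this wrapper, assuming the constructor arguments documented above and the iteration behaviour shown in the example; the dataset choice is arbitrary.

>>> from doctr.datasets import FUNSD
>>> from doctr.datasets.loader import DataLoader
>>> train_set = FUNSD(train=True, download=True)
>>> train_loader = DataLoader(train_set, shuffle=True, batch_size=32, drop_last=False)
>>> for images, targets in train_loader:
...     # one (images, targets) batch per iteration
...     pass  # forward pass / training step goes here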

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets -of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name          | size | characters
digits        | 10   | 0123456789
ascii_letters | 52   | abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
punctuation   | 32   | !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
currency      | 5    | £€¥¢฿
latin         | 96   | 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°
french        | 154  | 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
- -
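For illustration, a minimal sketch using the signature documented above and the digits vocab from the table; the sequences and target size are arbitrary examples.

>>> from doctr.datasets import encode_sequences
>>> vocab = "0123456789"  # the "digits" vocab listed above
>>> encoded = encode_sequences(sequences=["2021", "42"], vocab=vocab, target_size=8)
>>> print(encoded.shape)  # (2, 8): one row per sequence, padded to target_size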
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/documents.html b/v0.3.0/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.3.0/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
-
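To make the hierarchy concrete, here is a minimal sketch that builds a one-page document by hand with the constructors documented above; the values and coordinates are purely illustrative.

>>> from doctr.documents import Word, Line, Block, Page, Document
>>> word = Word(value="Hello", confidence=0.99, geometry=((0.1, 0.1), (0.3, 0.15)))
>>> line = Line(words=[word])  # geometry is resolved from the words it contains
>>> block = Block(lines=[line])  # a block groups lines (and possibly artefacts)
>>> page = Page(blocks=[block], page_idx=0, dimensions=(1024, 768))
>>> doc = Document(pages=[page])

Page.show and Document.show can then overlay these elements on the corresponding page image(s), passed as uint8 numpy arrays.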

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert them into images in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/genindex.html b/v0.3.0/genindex.html index a19b433943..21520455b4 100644 --- a/v0.3.0/genindex.html +++ b/v0.3.0/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -224,20 +224,42 @@

- - + + diff --git a/v0.3.0/getting_started/installing.html b/v0.3.0/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.3.0/getting_started/installing.html +++ b/v0.3.0/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.3.0/index.html b/v0.3.0/index.html index 4c6a28c66a..3a06afc6d9 100644 --- a/v0.3.0/index.html +++ b/v0.3.0/index.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + docTR documentation @@ -226,20 +226,42 @@
-

DocTR: Document Text Recognition

-

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta)

+

docTR: Document Text Recognition

+

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

DocTR provides an easy and powerful way to extract valuable information from your documents:

    -
  • 🧾 for automation: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • +
  • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • 👩‍🔬 for research: quickly compare your own architectures speed & performances with state-of-art models on public datasets.

-

Welcome to the documentation of DocTR!

Main Features

  • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

  • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor

  • -
  • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

  • +
  • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

  • ⚡ Optimized for inference speed on both CPU & GPU

  • -
  • 🐦 Light package, small dependencies

  • -
  • 🛠️ Daily maintained

  • -
  • 🏭 Easy integration

  • +
  • 🐦 Light package, minimal dependencies

  • +
  • 🛠️ Actively maintained by Mindee

  • +
  • 🏭 Easy integration (available templates for browser demo & API deployment)

-
-
-

Getting Started

-
-

Build & train your predictor

-
    -
  • Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

  • -
  • Fine-tune or train from scratch any detection or recognition model to specialize on your data

  • -
-

Model zoo

Text detection models

-
-

Text recognition models

-
-

Supported datasets

-
-
+
+
+
+
+
@@ -406,7 +381,7 @@

Supported datasets - +
Next @@ -446,10 +421,8 @@

Supported datasets + diff --git a/v0.3.0/installing.html b/v0.3.0/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.3.0/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running another OS than Linux, you will need a few extra dependencies.

-

For MacOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the last stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/models.html b/v0.3.0/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.3.0/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Architecture | Input shape     | # params | FUNSD Recall | FUNSD Precision | CORD Recall | CORD Precision | FPS
db_resnet50  | (1024, 1024, 3) | 25.2 M   | 82.14        | 87.64           | 92.49       | 89.66          | 2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model’s capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform experiments.

-
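For illustration, here is a minimal sketch of this benchmark protocol in TensorFlow, following the warm-up and measurement procedure described above; the timing code itself is an assumption, not the exact script used to produce the reported numbers.

>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> for _ in range(100):  # warm-up: 100 random tensors
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32), training=False)
>>> start = time.perf_counter()
>>> for _ in range(1000):  # measurement: 1000 batches of 1 frame
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32), training=False)
>>> print(f"{1000 / (time.perf_counter() - start):.2f} FPS")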
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following (a minimal sketch is given just after the list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
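For illustration, here is a minimal sketch of these three steps in TensorFlow; preprocess_for_detection is a hypothetical helper (not part of the doctr API), and the target size and normalization statistics are placeholders rather than the values bundled with each pretrained model.

>>> import tensorflow as tf
>>> def preprocess_for_detection(images, target_size=(1024, 1024), mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)):
...     # 1. resize each image (bilinear by default), allowing aspect-ratio deformation
...     resized = [tf.image.resize(img, target_size, method="bilinear") for img in images]
...     # 2. batch images together
...     batch = tf.stack(resized, axis=0)
...     # 3. normalize the batch with the training data statistics
...     return (batch - tf.constant(mean)) / tf.constant(std)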
-

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Text recognition model zoo

Architecture  | Input shape  | # params | FUNSD | CORD | FPS
crnn_vgg16_bn | (32, 128, 3) | 15.8M    | 86.02 | 91.3 | 12.8
sar_vgg16_bn  | (32, 128, 3) | 21.5M    | 86.2  | 91.7 | 3.3
sar_resnet31  | (32, 128, 3) | 53.1M    | 86.3  | 92.1 | 2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model’s capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following (a minimal sketch is given just after the list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
-
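For illustration, here is a minimal sketch of these four steps in TensorFlow; preprocess_for_recognition is a hypothetical helper (not part of the doctr API), and the target size, padding value and normalization statistics are placeholders.

>>> import tensorflow as tf
>>> def preprocess_for_recognition(crops, target_size=(32, 128), mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)):
...     processed = []
...     for img in crops:
...         # 1. resize while preserving the aspect ratio (no deformation)
...         resized = tf.image.resize(img, target_size, method="bilinear", preserve_aspect_ratio=True)
...         # 2. pad to the target size (with zeros by default)
...         processed.append(tf.image.pad_to_bounding_box(resized, 0, 0, target_size[0], target_size[1]))
...     # 3. batch images together
...     batch = tf.stack(processed, axis=0)
...     # 4. normalize the batch with the training data statistics
...     return (batch - tf.constant(mean)) / tf.constant(std)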
-

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) → CRNN[source]

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition”.

Example::
>>> import tensorflow as tf
>>> from doctr.models import crnn_vgg16_bn
>>> model = crnn_vgg16_bn(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture
doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) → SAR[source]

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example::
>>> import tensorflow as tf
>>> from doctr.models import sar_vgg16_bn
>>> model = sar_vgg16_bn(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture
doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) → SAR[source]

SAR with a resnet-31 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example::
>>> import tensorflow as tf
>>> from doctr.models import sar_resnet31
>>> model = sar_resnet31(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture
doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) → MASTER[source]

MASTER as described in the paper: https://arxiv.org/pdf/1910.02562.pdf.

Example::
>>> import tensorflow as tf
>>> from doctr.models import master
>>> model = master(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture

Recognition predictors


Combining the right components around a given architecture for easier usage.

doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → RecognitionPredictor[source]

Text recognition architecture.

Example::
>>> import numpy as np
>>> from doctr.models import recognition_predictor
>>> model = recognition_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
>>> out = model([input_page])

Parameters:

  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • pretrained – If True, returns a model pre-trained on our text recognition dataset

Returns:

Recognition predictor

End-to-End OCR


Predictors that localize and identify text elements in images

                               FUNSD                        CORD
Architecture                   Recall   Precision   FPS     Recall   Precision   FPS
db_resnet50 + crnn_vgg16_bn    70.08    74.77       0.85    82.19    79.67       1.6
db_resnet50 + sar_vgg16_bn     N/A      N/A         0.49    N/A      N/A         1.0
db_resnet50 + sar_resnet31     N/A      N/A         0.27    N/A      N/A         0.83
Gvision text detection         59.50    62.50               75.30    70.00
Gvision doc. text detection    64.00    53.30               68.90    61.10
AWS textract                   78.10    83.00               87.50    66.00

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

All the recognition models used by these predictors are trained with our French vocab (cf. Supported Vocabs).

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities.

FPS (Frames per second) is computed this way: we instantiate the predictor, warm up the model, and then measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. We used a c5.12xlarge AWS instance (CPU: Xeon Platinum 8275L) to perform the experiments.
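
The same kind of loop as for the recognition benchmark above can be wrapped around the end-to-end predictor (a minimal sketch; random pages stand in for the actual dataset images):

>>> import time
>>> import numpy as np
>>> from doctr.models import ocr_predictor
>>> predictor = ocr_predictor(pretrained=True)
>>> pages = [(255 * np.random.rand(1024, 1024, 3)).astype(np.uint8) for _ in range(10)]
>>> _ = predictor([pages[0]])  # warm-up
>>> start = time.time()
>>> for page in pages:  # batch size of 1
...     _ = predictor([page])
>>> fps = len(pages) / (time.time() - start)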

Results on private OCR datasets

                                     Receipts              Invoices              IDs
Architecture                         Recall   Precision    Recall   Precision    Recall   Precision
db_resnet50 + crnn_vgg16_bn (ours)   78.90    81.01        65.68    69.86        49.48    50.46
Gvision doc. text detection          68.91    59.89        63.20    52.85        43.70    29.21
AWS textract                         75.77    77.70        70.47    69.13        46.39    43.32

Two-stage approaches


These architectures involve one stage of text detection and one stage of text recognition. The text detection stage produces cropped images that are then passed to the text recognition block.
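
To make the two stages explicit, detection and recognition predictors can also be chained by hand. The sketch below assumes the detection predictor returns relative (xmin, ymin, xmax, ymax, score) boxes for each page, which may differ between versions; the ocr_predictor described below packages all of this for you:

>>> import numpy as np
>>> from doctr.models import detection_predictor, recognition_predictor
>>> det_model = detection_predictor(pretrained=True)
>>> reco_model = recognition_predictor(pretrained=True)
>>> page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> boxes = det_model([page])[0]  # assumed relative (xmin, ymin, xmax, ymax, score) boxes
>>> h, w = page.shape[:2]
>>> crops = [page[int(ymin * h):int(ymax * h), int(xmin * w):int(xmax * w)]
...          for xmin, ymin, xmax, ymax, _ in boxes]
>>> words = reco_model(crops) if crops else []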

doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → OCRPredictor[source]

End-to-end OCR architecture using one model for localization, and another for text recognition.

Example::
>>> import numpy as np
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> out = model([input_page])

Parameters:

  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

  • pretrained – If True, returns a model pre-trained on our OCR dataset

Returns:

OCR predictor

Model export

Utility functions to make the most of document analysis models.

Model compression

doctr.models.export.convert_to_tflite(tf_model: Model) → bytes[source]

Converts a model to TFLite format

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_tflite, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_tflite(model)
Parameters:

tf_model – a keras model

Returns:

the serialized TFLite model

Return type:

bytes
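
Since the returned value is raw bytes, the serialized model can be written straight to disk for use with the TFLite runtime (the file name is only an example):

>>> with open('my_model.tflite', 'wb') as f:
...     f.write(serialized_model)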

doctr.models.export.convert_to_fp16(tf_model: Model) → bytes[source]

Converts a model to half precision

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_fp16, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_fp16(model)
Parameters:

tf_model – a keras model

Returns:

the serialized FP16 model

Return type:

bytes

doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) → bytes[source]

Quantizes a TensorFlow model

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import quantize_model, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = quantize_model(model, (224, 224, 3))
Parameters:

  • tf_model – a keras model

  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

Returns:

the serialized quantized model

Return type:

bytes

Using SavedModel

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> _ = model(input_t, training=False)
>>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')

And loaded just as easily:

>>> import tensorflow as tf
>>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], 
"channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], 
"line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation 
(class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", 
false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", 
"Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], 
"51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, 
"b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], 
"db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 
18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], 
"json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 
19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 
15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 
3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 
18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, 
"coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.3.0/transforms.html b/v0.3.0/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.3.0/transforms.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.
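Because each transformation is a callable module, two of them can be chained simply by feeding the output of one into the next. The snippet below is a minimal sketch assuming the TensorFlow backend; the mean and standard deviation values are illustrative only.

import tensorflow as tf
from doctr.transforms import Normalize, Resize

# Resize to the target shape, then standardize each channel
resize = Resize((32, 32))
normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
out = normalize(resize(img))  # modules compose naturally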

-
-

Supported transformations

-

Here are all transformations that are available through DocTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
- -
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor to a Gaussian distribution for each channel

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a custom transformation function to the input tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.

-
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomGamma, RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations; only one will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Apply the input transformation with probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=0.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
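Putting these wrappers together, a typical augmentation pipeline could be sketched as follows. This is an illustrative, non-authoritative example assuming the TensorFlow backend; the chosen transformations and probabilities are arbitrary.

import tensorflow as tf
from doctr.transforms import (
    Compose, OneOf, RandomApply,
    RandomBrightness, RandomContrast, RandomJpegQuality, Resize,
)

# Always resize, randomly pick one photometric perturbation,
# and occasionally degrade the JPEG quality.
augmentations = Compose([
    Resize((32, 32)),
    OneOf([RandomBrightness(max_delta=0.3), RandomContrast(delta=0.3)]),
    RandomApply(RandomJpegQuality(min_quality=60), p=0.3),
])

out = augmentations(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))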
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.0/using_doctr/custom_models_training.html b/v0.3.0/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.3.0/using_doctr/custom_models_training.html +++ b/v0.3.0/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.3.0/using_doctr/running_on_aws.html b/v0.3.0/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.3.0/using_doctr/running_on_aws.html +++ b/v0.3.0/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.3.0/using_doctr/sharing_models.html b/v0.3.0/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.3.0/using_doctr/sharing_models.html +++ b/v0.3.0/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.3.0/using_doctr/using_contrib_modules.html b/v0.3.0/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.3.0/using_doctr/using_contrib_modules.html +++ b/v0.3.0/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.3.0/using_doctr/using_datasets.html b/v0.3.0/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.3.0/using_doctr/using_datasets.html +++ b/v0.3.0/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.3.0/using_doctr/using_model_export.html b/v0.3.0/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.3.0/using_doctr/using_model_export.html +++ b/v0.3.0/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.3.0/using_doctr/using_models.html b/v0.3.0/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.3.0/using_doctr/using_models.html +++ b/v0.3.0/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.3.0/utils.html b/v0.3.0/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.3.0/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.utils

-

This module gathers non-core features that complement the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page; it needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest window side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model's performance.

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements a text match metric (word-level accuracy) for the recognition task.

-

The raw aggregated metric is computed as follows:

-
-
\[\forall X, Y \in \mathcal{W}^N, \quad TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
\[\begin{split}\forall a, x \in \mathcal{W}, \quad f_a(x) = \left\{ \begin{array}{ll} 1 & \mbox{if } x = a \\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, and \(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode counterpart and its lower-case unidecode counterpart

-
-
-
- -
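As a rough usage sketch (the exact keys of the returned dictionary are an assumption based on the description above and should be checked against your version):

from doctr.utils.metrics import TextMatch

metric = TextMatch()
metric.update(['Hello', 'world'], ['hello', 'world'])
scores = metric.summary()
# Assumed keys: exact match on the raw strings, on lower-cased strings,
# on unidecoded strings, and on lower-cased unidecoded strings
print(scores)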
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and \(y\), and the function \(g_{X}\) defined as:

-
-
\[\begin{split}\forall y \in \mathcal{B}, \quad g_X(y) = \left\{ \begin{array}{ll} 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, and \(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float | None, float | None, float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores

-
-
-
- -
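For intuition, the IoU term used above can be computed for two straight boxes in (xmin, ymin, xmax, ymax) format as in the sketch below. This is an illustrative helper, not the library's internal implementation.

def box_iou(box_a, box_b):
    # Boxes are (xmin, ymin, xmax, ymax)
    xa, ya = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    xb, yb = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0, xb - xa) * max(0, yb - ya)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / float(area_a + area_b - inter)

# With the example above, the first prediction has IoU 0.49 with the ground truth,
# so it would not be counted as a match at the default threshold of 0.5
print(box_iou([0, 0, 100, 100], [0, 0, 70, 70]))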
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements an end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and \(y\), and the function \(h_{B, L}\) defined as:

-
-
\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, \quad h_{B,L}(b, l) = \left\{ \begin{array}{ll} 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, \(\mathcal{L}\) is the set of possible character sequences, and \(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall and precision for each level of string comparison flexibility, and the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.3.1/_modules/doctr/datasets/cord.html b/v0.3.1/_modules/doctr/datasets/cord.html index f98ee6901c..55b0584830 100644 --- a/v0.3.1/_modules/doctr/datasets/cord.html +++ b/v0.3.1/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.cord

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
-from doctr.utils.geometry import fit_rbbox
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['CORD']
+__all__ = ["CORD"]
 
 
 
-[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - Example:: - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 + :align: center + + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', - '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', - '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", + "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", + "cord_train.zip", + ) + + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", + "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", + "cord_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - - # # List images - self.root = os.path.join(self._root, 'image') - self.data: List[Tuple[str, Dict[str, Any]]] = [] + # List images + tmp_root = os.path.join(self.root, "image") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] self.train = train - self.sample_transforms = sample_transforms - for img_path in os.listdir(self.root): + np_dtype = np.float32 + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem _targets = [] - with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: if len(word["text"]) > 0: x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - if rotated_bbox: - box = list(fit_rbbox(np.array([ - [x[0], y[0]], - [x[1], y[1]], - [x[2], y[2]], - [x[3], y[3]], - ], dtype=np.float32))) + box: Union[List[float], np.ndarray] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], + dtype=np_dtype, + ) else: - # Reduce 8 coords to 4 + # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax box = [min(x), min(y), max(x), max(y)] - _targets.append((word['text'], box)) + _targets.append((word["text"], box)) text_targets, box_targets = zip(*_targets) - self.data.append(( - img_path, - dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets) - )) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
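The updated dataset shown in the diff above exposes recognition and detection modes in addition to the full OCR target. A rough usage sketch (assuming the archives can be downloaded in your environment):

from doctr.datasets import CORD

# Full targets: each sample is an image with its boxes and word labels
train_set = CORD(train=True, download=True)
img, target = train_set[0]

# Recognition mode: each sample is a word crop paired with its transcription
reco_set = CORD(train=True, download=True, recognition_task=True)
crop, word = reco_set[0]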
@@ -397,8 +461,8 @@

Source code for doctr.datasets.cord

       
     
   
-
- + + diff --git a/v0.3.1/_modules/doctr/datasets/core.html b/v0.3.1/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.3.1/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
- -
-
-
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/_modules/doctr/datasets/datasets/tensorflow.html b/v0.3.1/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.3.1/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset, _VisionDataset): - pass
- -
-
-
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/_modules/doctr/datasets/detection.html b/v0.3.1/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.3.1/_modules/doctr/datasets/detection.html +++ b/v0.3.1/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/doc_artefacts.html b/v0.3.1/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.3.1/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.3.1/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.3.1/_modules/doctr/datasets/funsd.html b/v0.3.1/_modules/doctr/datasets/funsd.html index 35d7ad4cf5..f08612f9fa 100644 --- a/v0.3.1/_modules/doctr/datasets/funsd.html +++ b/v0.3.1/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.funsd

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['FUNSD']
+__all__ = ["FUNSD"]
 
 
 
-[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - Example:: - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 + :align: center + + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' - SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' - FILE_NAME = 'funsd.zip' + URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" + SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" + FILE_NAME = "funsd.zip" def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + super().__init__( + self.URL, + self.FILE_NAME, + self.SHA256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - self.sample_transforms = sample_transforms + np_dtype = np.float32 # Use the subset - subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') + subfolder = os.path.join("dataset", "training_data" if train else "testing_data") # # List images - self.root = os.path.join(self._root, subfolder, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + tmp_root = os.path.join(self.root, subfolder, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: data = json.load(f) - _targets = [(word['text'], word['box']) for block in data['form'] - for word in block['words'] if len(word['text']) > 0] + _targets = [ + (word["text"], word["box"]) + for block in data["form"] + for word in block["words"] + if len(word["text"]) > 0 + ] text_targets, box_targets = zip(*_targets) - if rotated_bbox: - # box_targets: xmin, ymin, xmax, ymax -> x, y, w, h, alpha = 0 - box_targets = [ + if use_polygons: + # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + box_targets = [ # type: ignore[assignment] [ - (box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0 - ] for box in box_targets + [box[0], box[1]], + [box[2], box[1]], + [box[2], box[3]], + [box[0], box[3]], + ] + for box in box_targets ] - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets))) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) + ) + for crop, label in zip(crops, list(text_targets)): + # filter labels with unknown characters + if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
@@ -386,8 +453,8 @@

Source code for doctr.datasets.funsd

       
     
   
-
- + + diff --git a/v0.3.1/_modules/doctr/datasets/generator/tensorflow.html b/v0.3.1/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.3.1/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.3.1/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.3.1/_modules/doctr/datasets/ic03.html b/v0.3.1/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.3.1/_modules/doctr/datasets/ic03.html +++ b/v0.3.1/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/ic13.html b/v0.3.1/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.3.1/_modules/doctr/datasets/ic13.html +++ b/v0.3.1/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/iiit5k.html b/v0.3.1/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.3.1/_modules/doctr/datasets/iiit5k.html +++ b/v0.3.1/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/iiithws.html b/v0.3.1/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.3.1/_modules/doctr/datasets/iiithws.html +++ b/v0.3.1/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/imgur5k.html b/v0.3.1/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.3.1/_modules/doctr/datasets/imgur5k.html +++ b/v0.3.1/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/loader.html b/v0.3.1/_modules/doctr/datasets/loader.html index d32e6da298..ed80350ef0 100644 --- a/v0.3.1/_modules/doctr/datasets/loader.html +++ b/v0.3.1/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.loader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-import tensorflow as tf
-import numpy as np
-from typing import Optional
+from typing import Callable, Optional
 
-from .multithreading import multithread_exec
+import numpy as np
+import tensorflow as tf
 
 __all__ = ["DataLoader"]
 
@@ -293,12 +314,13 @@ 

Source code for doctr.datasets.loader

     """Collate multiple elements into batches
 
     Args:
+    ----
         samples: list of N tuples containing M elements
 
     Returns:
+    -------
        Tuple of M sequences containing N elements each
     """
-
     batch_data = zip(*samples)
 
     tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
@@ -307,23 +329,23 @@ 

Source code for doctr.datasets.loader

 
 
 
-[docs] +[docs] class DataLoader: """Implements a dataset wrapper for fast data loading - Example:: - >>> from doctr.datasets import FUNSD, DataLoader - >>> train_set = CORD(train=True, download=True) - >>> train_loader = DataLoader(train_set, batch_size=32) - >>> train_iter = iter(train_loader) - >>> images, targets = next(train_iter) + >>> from doctr.datasets import CORD, DataLoader + >>> train_set = CORD(train=True, download=True) + >>> train_loader = DataLoader(train_set, batch_size=32) + >>> train_iter = iter(train_loader) + >>> images, targets = next(train_iter) Args: + ---- dataset: the dataset shuffle: whether the samples should be shuffled before passing it to the iterator batch_size: number of elements in each batch drop_last: if `True`, drops the last batch if it isn't full - workers: number of workers to use for data loading + collate_fn: function to merge samples into a batch """ def __init__( @@ -332,17 +354,22 @@

Source code for doctr.datasets.loader

         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        workers: Optional[int] = None,
+        collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
         self.batch_size = batch_size
         nb = len(self.dataset) / batch_size
         self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
-        self.workers = workers
+        if collate_fn is None:
+            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
+        else:
+            self.collate_fn = collate_fn
         self.reset()
 
+    def __len__(self) -> int:
+        return self.num_batches
+
     def reset(self) -> None:
         # Updates indices after each epoch
         self._num_yielded = 0
@@ -358,9 +385,9 @@ 

Source code for doctr.datasets.loader

         if self._num_yielded < self.num_batches:
             # Get next indices
             idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
+            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
 
-            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
+            samples = list(map(self.dataset.__getitem__, indices))
 
             batch_data = self.collate_fn(samples)
 
@@ -401,8 +428,8 @@ 

Source code for doctr.datasets.loader

       
     
   
-
- +
+ diff --git a/v0.3.1/_modules/doctr/datasets/mjsynth.html b/v0.3.1/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.3.1/_modules/doctr/datasets/mjsynth.html +++ b/v0.3.1/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/ocr.html b/v0.3.1/_modules/doctr/datasets/ocr.html index 11297d5952..ce1ed8b0d4 100644 --- a/v0.3.1/_modules/doctr/datasets/ocr.html +++ b/v0.3.1/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.ocr

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple
 
-from .datasets import AbstractDataset
-from doctr.utils.geometry import fit_rbbox
+import numpy as np
 
+from .datasets import AbstractDataset
 
-__all__ = ['OCRDataset']
+__all__ = ["OCRDataset"]
 
 
 
-[docs] +[docs] class OCRDataset(AbstractDataset): """Implements an OCR dataset + >>> from doctr.datasets import OCRDataset + >>> train_set = OCRDataset(img_folder="/path/to/images", + >>> label_file="/path/to/labels.json") + >>> img, target = train_set[0] + Args: + ---- img_folder: local path to image folder (all jpg at the root) label_file: local path to the label file - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) - **kwargs: keyword arguments from `VisionDataset`. + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + **kwargs: keyword arguments from `AbstractDataset`. """ def __init__( self, img_folder: str, label_file: str, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, **kwargs: Any, ) -> None: - - self.sample_transforms = sample_transforms - self.root = img_folder + super().__init__(img_folder, **kwargs) # List images self.data: List[Tuple[str, Dict[str, Any]]] = [] - with open(label_file, 'rb') as f: + np_dtype = np.float32 + with open(label_file, "rb") as f: data = json.load(f) - for file_dic in data: + for img_name, annotations in data.items(): # Get image path - img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg' + img_name = Path(img_name) # File existence check if not os.path.exists(os.path.join(self.root, img_name)): raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") # handle empty images - if (len(file_dic["coordinates"]) == 0 or - (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")): - self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[]))) + if len(annotations["typed_words"]) == 0: + self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[]))) continue - is_valid: List[bool] = [] - box_targets: List[List[float]] = [] - for box in file_dic["coordinates"]: - if rotated_bbox: - x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32)) - box = [x, y, w, h, alpha] - is_valid.append(w > 0 and h > 0) - else: - xs, ys = zip(*box) - box = [min(xs), min(ys), max(xs), max(ys)] - is_valid.append(box[0] < box[2] and box[1] < box[3]) - if is_valid[-1]: - box_targets.append(box) + # Unpack the straight boxes (xmin, ymin, xmax, ymax) + geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + geoms = [ + [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]] # type: ignore[list-item] + for geom in geoms + ] + + text_targets = [obj["value"] for obj in annotations["typed_words"]] - text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid] - self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
+ self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
@@ -383,8 +402,8 @@

Source code for doctr.datasets.ocr

       
     
   
- - + + diff --git a/v0.3.1/_modules/doctr/datasets/recognition.html b/v0.3.1/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.3.1/_modules/doctr/datasets/recognition.html +++ b/v0.3.1/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/sroie.html b/v0.3.1/_modules/doctr/datasets/sroie.html index 66fd4ca3e0..04cf10bda2 100644 --- a/v0.3.1/_modules/doctr/datasets/sroie.html +++ b/v0.3.1/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.sroie

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import csv
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['SROIE']
+__all__ = ["SROIE"]
 
 
 
-[docs] +[docs] class SROIE(VisionDataset): """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction" <https://arxiv.org/pdf/2103.10213.pdf>`_. - Example:: - >>> from doctr.datasets import SROIE - >>> train_set = SROIE(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0 + :align: center + + >>> from doctr.datasets import SROIE + >>> train_set = SROIE(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip', - 'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip', - '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0", + "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f", + "sroie2019_train_task1.zip", + ) + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0", + "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2", + "sroie2019_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - self.sample_transforms = sample_transforms self.train = train - if rotated_bbox: - raise NotImplementedError + tmp_root = os.path.join(self.root, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + np_dtype = np.float32 - # # List images - self.root = os.path.join(self._root, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - _targets = [] - with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f: - for row in csv.reader(f, delimiter=','): - # Safeguard for blank lines - if len(row) > 0: - # Label may contain commas - label = ",".join(row[8:]) - # Reduce 8 coords to 4 - p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8]) - left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x) - top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y) - if len(label) > 0: - _targets.append((label, [left, top, right, bot])) - - text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets))) + with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f: + _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0] + + labels = [",".join(row[8:]) for row in _rows] + # reorder coordinates (8 -> (4,2) -> + # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines + coords: np.ndarray = np.stack( + [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0 + ) + + if not use_polygons: + # xmin, ymin, xmax, ymax + coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1) + + if recognition_task: + crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords) + for crop, label in zip(crops, labels): + if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0: + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, coords)) + else: + self.data.append((img_path, dict(boxes=coords, labels=labels))) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
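For reference, a minimal usage sketch of the SROIE class shown in this hunk, mirroring its docstring example; the recognition/detection flags are the parameters added above, and the download keyword is forwarded to `VisionDataset`:

>>> from doctr.datasets import SROIE
>>> train_set = SROIE(train=True, download=True)
>>> img, target = train_set[0]   # target is a dict with "boxes" (N, 4) and "labels"
>>> # word-crop / label pairs for recognition training (flag introduced above)
>>> rec_set = SROIE(train=False, download=True, recognition_task=True)
>>> crop, label = rec_set[0]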
@@ -390,8 +444,8 @@

Source code for doctr.datasets.sroie

       
     
   
-
- + + diff --git a/v0.3.1/_modules/doctr/datasets/svhn.html b/v0.3.1/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.3.1/_modules/doctr/datasets/svhn.html +++ b/v0.3.1/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/svt.html b/v0.3.1/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.3.1/_modules/doctr/datasets/svt.html +++ b/v0.3.1/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/synthtext.html b/v0.3.1/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.3.1/_modules/doctr/datasets/synthtext.html +++ b/v0.3.1/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.3.1/_modules/doctr/datasets/utils.html b/v0.3.1/_modules/doctr/datasets/utils.html index 2259698c0f..bde9304597 100644 --- a/v0.3.1/_modules/doctr/datasets/utils.html +++ b/v0.3.1/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.utils

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import string
 import unicodedata
+from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Sequence as SequenceType
+
 import numpy as np
-from typing import List, Optional, Any
+from PIL import Image
+
+from doctr.io.image import get_img_shape
+from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
 
 from .vocabs import VOCABS
 
-__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
+__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
+
+ImageTensor = TypeVar("ImageTensor")
 
 
 def translate(
     input_string: str,
     vocab_name: str,
-    unknown_char: str = '■',
+    unknown_char: str = "■",
 ) -> str:
     """Translate a string input in a given vocabulary
 
     Args:
+    ----
         input_string: input string to translate
         vocab_name: vocabulary to use (french, latin, ...)
         unknown_char: unknown character for non-translatable characters
 
     Returns:
-        A string translated in a given vocab"""
-
+    -------
+        A string translated in a given vocab
+    """
     if VOCABS.get(vocab_name) is None:
         raise KeyError("output vocabulary must be in vocabs dictionary")
 
-    translated = ''
+    translated = ""
     for char in input_string:
         if char not in VOCABS[vocab_name]:
             # we need to translate char into a vocab char
@@ -315,51 +350,63 @@ 

Source code for doctr.datasets.utils

                 # remove whitespaces
                 continue
             # normalize character if it is not in vocab
-            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
-            if char == '' or char not in VOCABS[vocab_name]:
+            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
+            if char == "" or char not in VOCABS[vocab_name]:
                 # if normalization fails or char still not in vocab, return unknown character
                 char = unknown_char
         translated += char
     return translated
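A small, hedged illustration of `translate`; it assumes the "latin" vocab hinted at in the docstring contains unaccented ASCII letters, so accented characters are NFD-normalized before lookup:

>>> from doctr.datasets.utils import translate
>>> translate("naïve", "latin")   # 'ï' has no exact match, normalizes to 'i' -> "naive"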
 
 
-def encode_sequence(
+def encode_string(
     input_string: str,
     vocab: str,
 ) -> List[int]:
     """Given a predefined mapping, encode the string to a sequence of numbers
 
     Args:
+    ----
         input_string: string to encode
         vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A list encoding the input_string"""
-
-    return list(map(vocab.index, input_string))  # type: ignore[arg-type]
+    -------
+        A list encoding the input_string
+    """
+    try:
+        return list(map(vocab.index, input_string))
+    except ValueError:
+        raise ValueError(
+            f"some characters cannot be found in 'vocab'. \
+                         Please check the input string {input_string} and the vocabulary {vocab}"
+        )
 
 
 def decode_sequence(
-    input_array: np.array,
+    input_seq: Union[np.ndarray, SequenceType[int]],
     mapping: str,
 ) -> str:
     """Given a predefined mapping, decode the sequence of numbers to a string
 
     Args:
-        input_array: array to decode
+    ----
+        input_seq: array to decode
         mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A string, decoded from input_array"""
-
-    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
+    -------
+        A string, decoded from input_seq
+    """
+    if not isinstance(input_seq, (Sequence, np.ndarray)):
+        raise TypeError("Invalid sequence type")
+    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
         raise AssertionError("Input must be an array of int, with max less than mapping size")
-    decoded = ''.join(mapping[idx] for idx in input_array)
-    return decoded
+
+    return "".join(map(mapping.__getitem__, input_seq))
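A quick round trip with the two helpers above; any plain string works as the vocab, since encoding is simple character indexing:

>>> from doctr.datasets.utils import encode_string, decode_sequence
>>> vocab = "abcdefghijklmnopqrstuvwxyz"
>>> encoded = encode_string("doctr", vocab)   # [3, 14, 2, 19, 17]
>>> decode_sequence(encoded, vocab)           # "doctr"
>>> encode_string("doctr!", vocab)            # raises ValueError: '!' is not in the vocab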
 
 
 
-[docs] +[docs] def encode_sequences( sequences: List[str], vocab: str, @@ -367,48 +414,53 @@

Source code for doctr.datasets.utils

     eos: int = -1,
     sos: Optional[int] = None,
     pad: Optional[int] = None,
-    **kwargs: Any,
+    dynamic_seq_length: bool = False,
 ) -> np.ndarray:
     """Encode character sequences using a given vocab as mapping
 
     Args:
+    ----
         sequences: the list of character sequences of size N
         vocab: the ordered vocab to use for encoding
         target_size: maximum length of the encoded data
         eos: encoding of End Of String
         sos: optional encoding of Start Of String
         pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
+        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
 
     Returns:
+    -------
         the padded encoded data as a tensor
     """
-
     if 0 <= eos < len(vocab):
         raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
 
-    if not isinstance(target_size, int):
-        target_size = max(len(w) for w in sequences)
-        if sos:
-            target_size += 1
-        if pad:
-            target_size += 1
+    if not isinstance(target_size, int) or dynamic_seq_length:
+        # Maximum string length + EOS
+        max_length = max(len(w) for w in sequences) + 1
+        if isinstance(sos, int):
+            max_length += 1
+        if isinstance(pad, int):
+            max_length += 1
+        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
 
     # Pad all sequences
-    if pad:  # pad with padding symbol
+    if isinstance(pad, int):  # pad with padding symbol
         if 0 <= pad < len(vocab):
             raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
         # In that case, add EOS at the end of the word before padding
-        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
+        default_symbol = pad
     else:  # pad with eos symbol
-        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
+        default_symbol = eos
+    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
 
-    for idx, seq in enumerate(sequences):
-        encoded_seq = encode_sequence(seq, vocab)
-        if pad:  # add eos at the end of the sequence
-            encoded_seq.append(eos)
-        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
+    # Encode the strings
+    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
+        if isinstance(pad, int):  # add eos at the end of the sequence
+            seq.append(eos)
+        encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)]
 
-    if sos:  # place eos symbol at the beginning of each sequence
+    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
         if 0 <= sos < len(vocab):
             raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
         encoded_data = np.roll(encoded_data, 1)
@@ -416,6 +468,59 @@ 

Source code for doctr.datasets.utils

 
     return encoded_data
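A worked example of the padding logic above (illustrative values; `eos` and `pad` must lie outside the vocab's index range):

>>> from doctr.datasets.utils import encode_sequences
>>> vocab = "abcdefghijklmnopqrstuvwxyz"
>>> out = encode_sequences(["ab", "abcd"], vocab, eos=26, pad=27, dynamic_seq_length=True)
>>> out.shape    # longest word (4) + 1 EOS + 1 PAD slot -> (2, 6)
>>> out[0]       # "ab" -> [0, 1, 26, 27, 27, 27]: characters, then EOS, then PAD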
+ + +def convert_target_to_relative( + img: ImageTensor, target: Union[np.ndarray, Dict[str, Any]] +) -> Tuple[ImageTensor, Union[Dict[str, Any], np.ndarray]]: + if isinstance(target, np.ndarray): + target = convert_to_relative_coords(target, get_img_shape(img)) + else: + target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img)) + return img, target + + +def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]: + """Crop a set of bounding boxes from an image + + Args: + ---- + img_path: path to the image + geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4) + + Returns: + ------- + a list of cropped images + """ + with Image.open(img_path) as pil_img: + img: np.ndarray = np.array(pil_img.convert("RGB")) + # Polygon + if geoms.ndim == 3 and geoms.shape[1:] == (4, 2): + return extract_rcrops(img, geoms.astype(dtype=int)) + if geoms.ndim == 2 and geoms.shape[1] == 4: + return extract_crops(img, geoms.astype(dtype=int)) + raise ValueError("Invalid geometry format") + + +def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]: + """Converts multiclass target to relative coordinates. + + Args: + ---- + img: Image + target: tuple of target polygons and their classes names + + Returns: + ------- + Image and dictionary of boxes, with class names as keys + """ + boxes = convert_to_relative_coords(target[0], get_img_shape(img)) + boxes_classes = target[1] + boxes_dict: Dict = {k: [] for k in sorted(set(boxes_classes))} + for k, poly in zip(boxes_classes, boxes): + boxes_dict[k].append(poly) + boxes_dict = {k: np.stack(v, axis=0) for k, v in boxes_dict.items()} + return img, boxes_dict
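A hedged sketch of `crop_bboxes_from_image` with straight boxes; the image path and coordinates are placeholders, boxes are absolute pixel values of shape (N, 4) and polygons would be (N, 4, 2):

>>> import numpy as np
>>> from doctr.datasets.utils import crop_bboxes_from_image
>>> boxes = np.array([[10, 20, 110, 60]])                     # xmin, ymin, xmax, ymax in pixels
>>> crops = crop_bboxes_from_image("path/to/receipt.jpg", boxes)
>>> crops[0].shape                                            # roughly (40, 100, 3): one RGB crop per box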
@@ -448,8 +553,8 @@

Source code for doctr.datasets.utils

       
     
   
- - + + diff --git a/v0.3.1/_modules/doctr/datasets/wildreceipt.html b/v0.3.1/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.3.1/_modules/doctr/datasets/wildreceipt.html +++ b/v0.3.1/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.3.1/_modules/doctr/documents/elements.html b/v0.3.1/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.3.1/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
-
- -
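For reference, a minimal sketch assembling the element hierarchy defined in this (removed) module by hand; geometries are relative ((xmin, ymin), (xmax, ymax)) boxes and the values are placeholders:

>>> from doctr.documents.elements import Word, Line, Block, Page, Document
>>> word = Word("hello", 0.99, ((0.1, 0.1), (0.3, 0.15)))
>>> line = Line([word])                                    # geometry resolved from its words
>>> page = Page([Block([line])], page_idx=0, dimensions=(595, 842))
>>> doc = Document([page])
>>> doc.render()                                           # "hello"
>>> doc.export()                                           # nested dict, one entry per child element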
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/_modules/doctr/documents/reader.html b/v0.3.1/_modules/doctr/documents/reader.html deleted file mode 100644 index cdcd814b6c..0000000000 --- a/v0.3.1/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
-
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/_modules/doctr/io/elements.html b/v0.3.1/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.3.1/_modules/doctr/io/elements.html +++ b/v0.3.1/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.3.1/_modules/doctr/io/html.html b/v0.3.1/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.3.1/_modules/doctr/io/html.html +++ b/v0.3.1/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.3.1/_modules/doctr/io/image/base.html b/v0.3.1/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.3.1/_modules/doctr/io/image/base.html +++ b/v0.3.1/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.3.1/_modules/doctr/io/image/tensorflow.html b/v0.3.1/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.3.1/_modules/doctr/io/image/tensorflow.html +++ b/v0.3.1/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.3.1/_modules/doctr/io/pdf.html b/v0.3.1/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.3.1/_modules/doctr/io/pdf.html +++ b/v0.3.1/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.3.1/_modules/doctr/io/reader.html b/v0.3.1/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.3.1/_modules/doctr/io/reader.html +++ b/v0.3.1/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.3.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.3.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.3.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.3.1/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.3.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.3.1/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.3.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.3.1/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.3.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.3.1/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.3.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.3.1/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.3.1/_modules/doctr/models/classification/zoo.html b/v0.3.1/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.3.1/_modules/doctr/models/classification/zoo.html +++ b/v0.3.1/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

<
- + diff --git a/v0.3.1/_modules/doctr/models/detection/differentiable_binarization.html b/v0.3.1/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index 38e9b36ec2..0000000000 --- a/v0.3.1/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.differentiable_binarization - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to unshrink polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize p_map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: The first parameter.
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly casted to a ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
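The unclip step above can be reproduced in isolation: the offset distance is area * unclip_ratio / perimeter, and pyclipper grows the contour by that margin (values below are illustrative):

>>> import numpy as np
>>> import pyclipper
>>> from shapely.geometry import Polygon
>>> points = np.array([[0, 0], [100, 0], [100, 30], [0, 30]])
>>> poly = Polygon(points)
>>> distance = poly.area * 1.5 / poly.length        # unclip_ratio = 1.5 -> ~17.3 px here
>>> offset = pyclipper.PyclipperOffset()
>>> offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
>>> expanded = np.asarray(offset.Execute(distance)[0])   # enlarged polygon around the original box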
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channel to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature maps is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
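A tiny numeric check of the point-to-segment distance above (`compute_distance` is a staticmethod, so no model instance is needed):

>>> import numpy as np
>>> from doctr.models.detection.differentiable_binarization import DBNet
>>> xs, ys = np.meshgrid(np.arange(5, dtype=float), np.arange(4, dtype=float))
>>> a, b = np.array([0.0, 0.0]), np.array([4.0, 0.0])
>>> dist = DBNet.compute_distance(xs, ys, a, b)
>>> dist[2, 2]    # the point (x=2, y=2) projects inside [a, b], so the distance is simply 2.0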
-
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon threshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coord., to draw the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
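The balanced BCE above keeps every positive pixel but only the hardest negatives, capped at three negatives per positive. A toy sketch of that selection (tensor values are illustrative):

# Sketch of the 3:1 hard-negative mining used by the balanced BCE term (toy tensors).
import tensorflow as tf

bce_loss = tf.constant([0.1, 2.0, 0.3, 1.5, 0.05])   # per-pixel BCE inside the valid mask
seg_target = tf.constant([1.0, 0.0, 0.0, 0.0, 1.0])  # 2 positive pixels, 3 negatives
neg_target = 1.0 - seg_target
positive_count = tf.reduce_sum(seg_target)
negative_count = tf.minimum(tf.reduce_sum(neg_target), 3.0 * positive_count)
# Keep only the `negative_count` largest negative losses
negative_loss, _ = tf.nn.top_k(bce_loss * neg_target, tf.cast(negative_count, tf.int32))
balanced_bce = (tf.reduce_sum(bce_loss * seg_target) + tf.reduce_sum(negative_loss)) / (
    positive_count + negative_count + 1e-6
)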
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
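The backbone wiring in _db_resnet above can be exercised on its own. A minimal sketch (the input shape and dummy input are illustrative; IntermediateLayerGetter is the doctr utility imported at the top of this module):

# Sketch: pulling the four FPN input maps out of a Keras ResNet-50, as _db_resnet does.
import tensorflow as tf
from doctr.models.utils import IntermediateLayerGetter

resnet = tf.keras.applications.ResNet50(include_top=False, weights=None, input_shape=(512, 512, 3), pooling=None)
feat_extractor = IntermediateLayerGetter(
    resnet, ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"]
)
feat_maps = feat_extractor(tf.zeros((1, 512, 512, 3)))  # one feature map per requested layer (strides 4/8/16/32)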
-
-
-
-[docs]
-def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet:
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import db_resnet50
-        >>> model = db_resnet50(pretrained=True)
-        >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on our text detection dataset
-
-    Returns:
-        text detection architecture
-    """
-
-    return _db_resnet('db_resnet50', pretrained, **kwargs)
\ No newline at end of file
diff --git a/v0.3.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.3.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
index 9145c7c3fd..66cef8663d 100644
--- a/v0.3.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
+++ b/v0.3.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
@@ -13,7 +13,7 @@
@@ -225,20 +225,42 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import List, Tuple, Optional, Any, Dict
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
+
+from ...classification import mobilenet_v3_large
 from .base import DBPostProcessor, _DBNet
 
-__all__ = ['DBNet', 'db_resnet50']
+__all__ = ["DBNet", "db_resnet50", "db_mobilenet_v3_large"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
+    "db_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_resnet50-649fa22b.weights.h5&src=0",
+    },
+    "db_mobilenet_v3_large": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_mobilenet_v3_large-ee2e1dbe.weights.h5&src=0",
     },
 }
 
@@ -313,6 +348,7 @@ 

Source code for doctr.models.detection.differentiable_binarization.tensorflo <https://arxiv.org/pdf/1612.03144.pdf>`_. Args: + ---- channels: number of channel to output """ @@ -322,9 +358,9 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo ) -> None: super().__init__() self.channels = channels - self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest') - self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)] - self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)] + self.upsample = layers.UpSampling2D(size=(2, 2), interpolation="nearest") + self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer="he_normal") for _ in range(4)] + self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2**idx) for idx in range(4)] @staticmethod def build_upsampling( @@ -334,20 +370,21 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo """Module which performs a 3x3 convolution followed by up-sampling Args: + ---- channels: number of output channels dilation_factor (int): dilation factor to scale the convolution output before concatenation Returns: + ------- a keras.layers.Layer object, wrapping these operations in a sequential module """ - - _layers = conv_sequence(channels, 'relu', True, kernel_size=3) + _layers = conv_sequence(channels, "relu", True, kernel_size=3) if dilation_factor > 1: - _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest')) + _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation="nearest")) - module = keras.Sequential(_layers) + module = Sequential(_layers) return module @@ -359,7 +396,6 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo x: List[tf.Tensor], **kwargs: Any, ) -> tf.Tensor: - # Channel mapping results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)] # Upsample & sum @@ -371,200 +407,324 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo return layers.concatenate(results) -class DBNet(_DBNet, keras.Model, NestedObject): +class DBNet(_DBNet, Model, NestedObject): """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_. Args: + ---- feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to + bin_thresh: threshold for binarization + box_thresh: minimal objectness score to consider a box + assume_straight_pages: if True, fit straight bounding boxes only + exportable: onnx exportable returns only logits + cfg: the configuration dict of the model + class_names: list of class names """ - _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "fpn", "probability_head", "threshold_head", "postprocessor"] def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, - rotated_bbox: bool = False, + fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea + bin_thresh: float = 0.3, + box_thresh: float = 0.1, + assume_straight_pages: bool = True, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, + class_names: List[str] = [CLASS_NAME], ) -> None: - super().__init__() + self.class_names = class_names + num_classes: int = len(self.class_names) self.cfg = cfg self.feat_extractor = feature_extractor - self.rotated_bbox = rotated_bbox + self.exportable = exportable + self.assume_straight_pages = assume_straight_pages self.fpn = FeaturePyramidNetwork(channels=fpn_channels) # Initialize kernels _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape] output_shape = tuple(self.fpn(_inputs).shape) - self.probability_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] + self.probability_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + self.threshold_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + + self.postprocessor = DBPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh ) - self.threshold_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] - ) - - self.postprocessor = 
DBPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, thresh_map: tf.Tensor, - target: List[Dict[str, Any]] + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output Args: + ---- out_map: output feature map of the model of shape (N, H, W, C) thresh_map: threshold map of shape (N, H, W, C) target: list of dictionary where each dict has a `boxes` and a `flags` entry + gamma: modulating factor in the focal loss formula + alpha: balancing factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") - prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1])) - thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) + prob_map = tf.math.sigmoid(out_map) + thresh_map = tf.math.sigmoid(thresh_map) - seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + seg_target, seg_mask, thresh_target, thresh_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) + seg_mask = tf.cast(seg_mask, tf.float32) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) - # Compute balanced BCE loss for proba_map - bce_scale = 5. - bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask] - - neg_target = 1 - seg_target[seg_mask] - positive_count = tf.math.reduce_sum(seg_target[seg_mask]) - negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count]) - negative_loss = bce_loss * neg_target - negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32)) - sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss) - balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6) - - # Compute dice loss for approxbin_map - bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask]))) - - bce_min = tf.math.reduce_min(bce_loss) - weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1. 
- inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights) - union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8 - dice_loss = 1 - 2.0 * inter / union + # Focal loss + focal_scale = 10.0 + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + + # Convert logits to prob, compute gamma factor + p_t = (seg_target * prob_map) + ((1 - seg_target) * (1 - prob_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class or for approx binary_map + if len(self.class_names) > 1: + dice_map = tf.nn.softmax(out_map, axis=-1) + else: + # compute binary map instead + dice_map = 1.0 / (1.0 + tf.exp(-50 * (prob_map - thresh_map))) + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) # Compute l1 loss for thresh_map - l1_scale = 10. if tf.reduce_any(thresh_mask): - l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask])) + thresh_mask = tf.cast(thresh_mask, tf.float32) + l1_loss = tf.reduce_sum(tf.abs(thresh_map - thresh_target) * thresh_mask) / ( + tf.reduce_sum(thresh_mask) + eps + ) else: - l1_loss = tf.constant(0.) + l1_loss = tf.constant(0.0) - return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss + return l1_loss + focal_scale * focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) feat_concat = self.fpn(feat_maps, **kwargs) logits = self.probability_head(feat_concat, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: - # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + if target is None or return_preds: + # Post-process boxes (keep only text predictions) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: thresh_map = self.threshold_head(feat_concat, **kwargs) loss = self.compute_loss(logits, thresh_map, target) - out['loss'] = loss + out["loss"] = loss return out -def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: +def _db_resnet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['fpn_channels'] = 
kwargs.get('fpn_channels', _cfg['fpn_channels']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) # Feature extractor - resnet = tf.keras.applications.__dict__[_cfg['backbone']]( - include_top=False, - weights=None, - input_shape=_cfg['input_shape'], - pooling=None, + feat_extractor = IntermediateLayerGetter( + backbone_fn( + weights="imagenet" if pretrained_backbone else None, + include_top=False, + pooling=None, + input_shape=_cfg["input_shape"], + ), + fpn_layers, ) + # Build the model + model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + + # Load pretrained parameters + if pretrained: + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) + + return model + + +def _db_mobilenet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained + + # Patch the config + _cfg = deepcopy(default_cfgs[arch]) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = default_cfgs[arch].get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor feat_extractor = IntermediateLayerGetter( - resnet, - _cfg['fpn_layers'], + backbone_fn( + input_shape=_cfg["input_shape"], + include_top=False, + pretrained=pretrained_backbone, + ), + fpn_layers, ) - kwargs['fpn_channels'] = _cfg['fpn_channels'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] - # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model
-[docs] +[docs] def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture Returns: + ------- text detection architecture """ + return _db_resnet( + "db_resnet50", + pretrained, + ResNet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
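The rewritten compute_loss replaces the balanced BCE with a focal term and a class-reduced dice term. A minimal sketch of both formulas on toy tensors (shapes and values are illustrative, single class; for brevity the dice term uses the probability map directly instead of the approximate binary map):

# Sketch of the focal + dice terms used by the updated DBNet loss (toy values).
import tensorflow as tf

logits = tf.constant([[[2.0], [-1.0], [0.5]]])        # (N, pixels, classes), flattened for brevity
seg_target = tf.constant([[[1.0], [0.0], [1.0]]])
seg_mask = tf.ones_like(seg_target)                   # nothing masked out in this toy example
alpha, gamma, eps = 0.5, 2.0, 1e-8

bce = tf.keras.losses.binary_crossentropy(seg_target[..., None], logits[..., None], from_logits=True)
prob_map = tf.sigmoid(logits)
p_t = seg_target * prob_map + (1 - seg_target) * (1 - prob_map)
alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha)
# Well-classified pixels are down-weighted by (1 - p_t) ** gamma
focal_loss = tf.reduce_sum(seg_mask * alpha_t * (1 - p_t) ** gamma * bce) / tf.reduce_sum(seg_mask)

inter = tf.reduce_sum(seg_mask * prob_map * seg_target, axis=[0, 1])
cardinality = tf.reduce_sum(seg_mask * (prob_map + seg_target), axis=[0, 1])
dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps))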
+ + + +
+[docs] +def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: + """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" + <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. + + >>> import tensorflow as tf + >>> from doctr.models import db_mobilenet_v3_large + >>> model = db_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) - return _db_resnet('db_resnet50', pretrained, **kwargs)
+ Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture + + Returns: + ------- + text detection architecture + """ + return _db_mobilenet( + "db_mobilenet_v3_large", + pretrained, + mobilenet_v3_large, + ["inverted_2", "inverted_5", "inverted_11", "final_block"], + **kwargs, + )

@@ -598,8 +758,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

diff --git a/v0.3.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.3.1/_modules/doctr/models/detection/fast/tensorflow.html
index 5b84d2dea1..65e1a77af8 100644
--- a/v0.3.1/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/v0.3.1/_modules/doctr/models/detection/fast/tensorflow.html
@@ -13,7 +13,7 @@
@@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

diff --git a/v0.3.1/_modules/doctr/models/detection/linknet.html b/v0.3.1/_modules/doctr/models/detection/linknet.html
deleted file mode 100644
index 129cfdce8b..0000000000
--- a/v0.3.1/_modules/doctr/models/detection/linknet.html
+++ /dev/null
@@ -1,644 +0,0 @@

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the p_map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from differentiable linknet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing x, y, w, h, score for the box
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
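The box extraction above is a plain connected-component pass over the binarized map. A minimal sketch on a synthetic bitmap:

# Sketch of the connected-components step used by bitmap_to_boxes (synthetic 5x8 bitmap).
import cv2
import numpy as np

bitmap = np.zeros((5, 8), dtype=np.uint8)
bitmap[1:4, 2:6] = 1                                   # one rectangular blob of "text" pixels
n_labels, label_img = cv2.connectedComponents(bitmap, connectivity=4)
points = np.array(np.where(label_img == 1)[::-1]).T.astype(np.int32)  # blob pixels as (x, y)
x, y, w, h = cv2.boundingRect(points)                  # absolute box, later rescaled to relative coords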
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs]
-def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet:
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import linknet
-        >>> model = linknet(pretrained=True)
-        >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on our text detection dataset
-
-    Returns:
-        text detection architecture
-    """
-
-    return _linknet('linknet', pretrained, **kwargs)
\ No newline at end of file
diff --git a/v0.3.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.3.1/_modules/doctr/models/detection/linknet/tensorflow.html
index cd4f446673..ce995f99d4 100644
--- a/v0.3.1/_modules/doctr/models/detection/linknet/tensorflow.html
+++ b/v0.3.1/_modules/doctr/models/detection/linknet/tensorflow.html
@@ -13,7 +13,7 @@
@@ -225,20 +225,42 @@

Source code for doctr.models.detection.linknet.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.classification import resnet18, resnet34, resnet50
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.backbones import ResnetStage
-from doctr.models.utils import conv_sequence, load_pretrained_params
-from .base import LinkNetPostProcessor, _LinkNet
 
-__all__ = ['LinkNet', 'linknet16']
+from .base import LinkNetPostProcessor, _LinkNet
 
+__all__ = ["LinkNet", "linknet_resnet18", "linknet_resnet34", "linknet_resnet50"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet16': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'num_classes': 1,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': None,
+    "linknet_resnet18": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet18-615a82c5.weights.h5&src=0",
+    },
+    "linknet_resnet34": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet34-9d772be5.weights.h5&src=0",
+    },
+    "linknet_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet50-6bf6c8b5.weights.h5&src=0",
     },
 }
 
 
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
+def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential:
     """Creates a LinkNet decoder block"""
-
     return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
+        *conv_sequence(in_chan // 4, "relu", True, kernel_size=1, **kwargs),
         layers.Conv2DTranspose(
             filters=in_chan // 4,
             kernel_size=3,
-            strides=2,
+            strides=stride,
             padding="same",
             use_bias=False,
-            kernel_initializer='he_normal'
+            kernel_initializer="he_normal",
         ),
         layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
+        layers.Activation("relu"),
+        *conv_sequence(out_chan, "relu", True, kernel_size=1),
     ])
 
 
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module"""
+class LinkNetFPN(Model, NestedObject):
+    """LinkNet Decoder module"""
 
     def __init__(
         self,
+        out_chans: int,
+        in_shapes: List[Tuple[int, ...]],
     ) -> None:
-
         super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
+        self.out_chans = out_chans
+        strides = [2] * (len(in_shapes) - 1) + [1]
+        i_chans = [s[-1] for s in in_shapes[::-1]]
+        o_chans = i_chans[1:] + [out_chans]
+        self.decoders = [
+            decoder_block(in_chan, out_chan, s, input_shape=in_shape)
+            for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1])
+        ]
+
+    def call(self, x: List[tf.Tensor], **kwargs: Any) -> tf.Tensor:
+        out = 0
+        for decoder, fmap in zip(self.decoders, x[::-1]):
+            out = decoder(out + fmap, **kwargs)
+        return out
 
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(_LinkNet, keras.Model):
+    def extra_repr(self) -> str:
+        return f"out_chans={self.out_chans}"
+
+
+class LinkNet(_LinkNet, Model):
     """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
     <https://arxiv.org/pdf/1707.03718.pdf>`_.
 
     Args:
-        num_classes: number of channels for the output
+    ----
+        feature extractor: the backbone serving as feature extractor
+        fpn_channels: number of channels each extracted feature maps is mapped to
+        bin_thresh: threshold for binarization of the output feature map
+        box_thresh: minimal objectness score to consider a box
+        assume_straight_pages: if True, fit straight bounding boxes only
+        exportable: onnx exportable returns only logits
+        cfg: the configuration dict of the model
+        class_names: list of class names
     """
 
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
+    _children_names: List[str] = ["feat_extractor", "fpn", "classifier", "postprocessor"]
 
     def __init__(
         self,
-        num_classes: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        rotated_bbox: bool = False,
+        feat_extractor: IntermediateLayerGetter,
+        fpn_channels: int = 64,
+        bin_thresh: float = 0.1,
+        box_thresh: float = 0.1,
+        assume_straight_pages: bool = True,
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
+        class_names: List[str] = [CLASS_NAME],
     ) -> None:
         super().__init__(cfg=cfg)
 
-        self.rotated_bbox = rotated_bbox
+        self.class_names = class_names
+        num_classes: int = len(self.class_names)
 
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
+        self.exportable = exportable
+        self.assume_straight_pages = assume_straight_pages
+
+        self.feat_extractor = feat_extractor
 
-        self.fpn = LinkNetFPN()
+        self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape])
+        self.fpn.build(self.feat_extractor.output_shape)
 
         self.classifier = Sequential([
             layers.Conv2DTranspose(
@@ -393,154 +442,246 @@ 

Source code for doctr.models.detection.linknet.tensorflow

strides=2, padding="same", use_bias=False, - kernel_initializer='he_normal' + kernel_initializer="he_normal", + input_shape=self.fpn.decoders[-1].output_shape[1:], ), layers.BatchNormalization(), - layers.Activation('relu'), - *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), + layers.Activation("relu"), + *conv_sequence(32, "relu", True, kernel_size=3, strides=1), layers.Conv2DTranspose( filters=num_classes, kernel_size=2, strides=2, padding="same", - use_bias=False, - kernel_initializer='he_normal' + use_bias=True, + kernel_initializer="he_normal", ), ]) - self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) + self.postprocessor = LinkNetPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh + ) def compute_loss( self, out_map: tf.Tensor, - target: List[Dict[str, Any]], - focal_loss: bool = False, - alpha: float = .5, - gamma: float = 2., - edge_factor: float = 2., + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on <https://github.com/tensorflow/addons/>`_. Args: + ---- out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry - focal_loss: if True, use focal loss instead of BCE - edge_factor: boost factor for box edges (in case of BCE) + gamma: modulating factor in the focal loss formula alpha: balancing factor in the focal loss formula - gammma: modulating factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ - seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) - edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) + seg_target, seg_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - - # Get the cross_entropy for each entry - bce = tf.keras.losses.binary_crossentropy( - seg_target[seg_mask], - tf.squeeze(out_map, axis=[-1])[seg_mask], - from_logits=True) - - if focal_loss: - if gamma and gamma < 0: - raise ValueError("Value of gamma should be greater than or equal to zero.") - - # Convert logits to prob, compute gamma factor - pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) - p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) - modulating_factor = tf.pow((1.0 - p_t), gamma) - - # Compute alpha factor - alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) - - # compute the final loss - loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) - - else: - # Compute BCE loss with highlighted edges - loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), - bce - ) - loss = tf.reduce_mean(loss) - - return loss + seg_mask = tf.cast(seg_mask, tf.float32) + + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + proba_map = tf.sigmoid(out_map) + + # Focal loss + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + # Convert logits to prob, compute gamma factor + p_t = (seg_target * proba_map) + ((1 - seg_target) * (1 - proba_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * 
(1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class + dice_map = tf.nn.softmax(out_map, axis=-1) if len(self.class_names) > 1 else proba_map + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) + + return focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, - focal_loss: bool = True, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - - logits = self.stem(x) - logits = self.fpn(logits) - logits = self.classifier(logits) + feat_maps = self.feat_extractor(x, **kwargs) + logits = self.fpn(feat_maps, **kwargs) + logits = self.classifier(logits, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) + if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: + if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: - loss = self.compute_loss(logits, target, focal_loss) - out['loss'] = loss + loss = self.compute_loss(logits, target) + out["loss"] = loss return out -def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: +def _linknet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> LinkNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor + feat_extractor = IntermediateLayerGetter( + backbone_fn( + pretrained=pretrained_backbone, + include_top=False, + input_shape=_cfg["input_shape"], + ), + fpn_layers, + ) - kwargs['num_classes'] = _cfg['num_classes'] - kwargs['input_shape'] = _cfg['input_shape'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(cfg=_cfg, **kwargs) + model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + 
skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model -
-[docs] -def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
+[docs] +def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet18 + >>> model = linknet_resnet18(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture + + Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet18", + pretrained, + resnet18, + ["resnet_block_1", "resnet_block_3", "resnet_block_5", "resnet_block_7"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet34(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet16 - >>> model = linknet16(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet34 + >>> model = linknet_resnet34(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture Returns: + ------- text detection architecture """ + return _linknet( + "linknet_resnet34", + pretrained, + resnet34, + ["resnet_block_2", "resnet_block_6", "resnet_block_12", "resnet_block_15"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet50(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet50 + >>> model = linknet_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture - return _linknet('linknet16', pretrained, **kwargs)
+ Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet50", + pretrained, + resnet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
@@ -574,8 +715,8 @@

Source code for doctr.models.detection.linknet.tensorflow

diff --git a/v0.3.1/_modules/doctr/models/detection/zoo.html b/v0.3.1/_modules/doctr/models/detection/zoo.html
index d3128b8d14..3651c4e2d3 100644
--- a/v0.3.1/_modules/doctr/models/detection/zoo.html
+++ b/v0.3.1/_modules/doctr/models/detection/zoo.html
@@ -13,7 +13,7 @@
@@ -225,20 +225,42 @@

Source code for doctr.models.detection.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
 from doctr.file_utils import is_tf_available, is_torch_available
-from .core import DetectionPredictor
-from ..preprocessor import PreProcessor
-from .. import detection
 
+from .. import detection
+from ..detection.fast import reparameterize
+from ..preprocessor import PreProcessor
+from .predictor import DetectionPredictor
 
 __all__ = ["detection_predictor"]
 
+ARCHS: List[str]
+
 
 if is_tf_available():
-    ARCHS = ['db_resnet50', 'linknet16']
+    ARCHS = [
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
 elif is_torch_available():
-    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
+    ARCHS = [
+        "db_resnet34",
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
+
 
+def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
+        _model = detection.__dict__[arch](
+            pretrained=pretrained,
+            pretrained_backbone=kwargs.get("pretrained_backbone", True),
+            assume_straight_pages=assume_straight_pages,
+        )
+        # Reparameterize FAST models by default to lower inference latency and memory usage
+        if isinstance(_model, detection.FAST):
+            _model = reparameterize(_model)
+    else:
+        if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
+            raise ValueError(f"unknown architecture: {type(arch)}")
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+        _model = arch
+        _model.assume_straight_pages = assume_straight_pages
+        _model.postprocessor.assume_straight_pages = assume_straight_pages
 
-    # Detection
-    _model = detection.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 1)
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 2)
     predictor = DetectionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
-        _model
+        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
+        _model,
     )
     return predictor
 
 
 
-[docs] -def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: +[docs] +def detection_predictor( + arch: Any = "fast_base", + pretrained: bool = False, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + batch_size: int = 2, + **kwargs: Any, +) -> DetectionPredictor: """Text detection architecture. - Example:: - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_resnet50') + ---- + arch: name of the architecture or model itself to use (e.g. 'db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset + assume_straight_pages: If True, fit straight boxes to the page + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right + batch_size: number of samples the model processes in parallel + **kwargs: optional keyword arguments passed to the architecture Returns: + ------- Detection predictor """ - - return _predictor(arch, pretrained, **kwargs)
+ return _predictor( + arch=arch, + pretrained=pretrained, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + batch_size=batch_size, + **kwargs, + )
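Editor's note: a short usage sketch of the expanded detection_predictor signature shown above. The argument values below are illustrative, and 'db_resnet50' is taken from the ARCHS list in this hunk; FAST architectures passed by name are reparameterized automatically by _predictor to lower inference latency and memory usage.

import numpy as np
from doctr.models import detection_predictor

predictor = detection_predictor(
    arch="db_resnet50",
    pretrained=True,
    assume_straight_pages=False,   # handle rotated pages instead of fitting straight boxes
    preserve_aspect_ratio=True,    # pad the page to preserve its aspect ratio
    symmetric_pad=True,            # pad symmetrically rather than bottom-right only
    batch_size=2,
)
input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
out = predictor([input_page])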
@@ -367,8 +449,8 @@

Source code for doctr.models.detection.zoo

       
     
   

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/_modules/doctr/models/factory/hub.html b/v0.3.1/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.3.1/_modules/doctr/models/factory/hub.html +++ b/v0.3.1/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.3.1/_modules/doctr/models/recognition/crnn.html b/v0.3.1/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.3.1/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.crnn - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs decoding of raw output with CTC and decoding of CTC predictions
-        with label_to_idx mapping dictionnary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/crnn/tensorflow.html index 41cc93dd23..bc64da9a1b 100644 --- a/v0.3.1/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.crnn.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import tensorflow as tf
 from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential, Model
-from typing import Tuple, Dict, Any, Optional, List
+from tensorflow.keras.models import Model, Sequential
+
+from doctr.datasets import VOCABS
 
-from ... import backbones
-from ...utils import load_pretrained_params
+from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
+__all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
+    "crnn_vgg16_bn": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["legacy_french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_vgg16_bn-9c188f45.weights.h5&src=0",
     },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
+    "crnn_mobilenet_v3_small": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_small-54850265.weights.h5&src=0",
+    },
+    "crnn_mobilenet_v3_large": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_large-c64045e5.weights.h5&src=0",
     },
 }
 
 
 class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
+    """Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
         ignore_case: if True, ignore case of letters
         ignore_accents: if True, ignore accents of letters
@@ -325,37 +353,57 @@ 

Source code for doctr.models.recognition.crnn.tensorflow

def __call__( self, - logits: tf.Tensor - ) -> List[Tuple[str, float]]: - """ - Performs decoding of raw output with CTC and decoding of CTC predictions + logits: tf.Tensor, + beam_width: int = 1, + top_paths: int = 1, + ) -> Union[List[Tuple[str, float]], List[Tuple[List[str], List[float]]]]: + """Performs decoding of raw output with CTC and decoding of CTC predictions with label_to_idx mapping dictionnary Args: + ---- logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1 + beam_width: An int scalar >= 0 (beam search beam width). + top_paths: An int scalar >= 0, <= beam_width (controls output size). Returns: + ------- A list of decoded words of length BATCH_SIZE + """ # Decode CTC _decoded, _log_prob = tf.nn.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), - tf.fill(logits.shape[0], logits.shape[1]), - beam_width=1, top_paths=1, + tf.fill(tf.shape(logits)[:1], tf.shape(logits)[1]), + beam_width=beam_width, + top_paths=top_paths, ) - out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab)) - probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) + + _decoded = tf.sparse.concat( + 1, + [tf.sparse.expand_dims(dec, axis=1) for dec in _decoded], + expand_nonconcat_dims=True, + ) # dim : batchsize x beamwidth x actual_max_len_predictions + out_idxs = tf.sparse.to_dense(_decoded, default_value=len(self.vocab)) # Map it to characters _decoded_strings_pred = tf.strings.reduce_join( inputs=tf.nn.embedding_lookup(tf.constant(self._embedding, dtype=tf.string), out_idxs), - axis=-1 + axis=-1, ) _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] - word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - + decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value="not valid")[ + :, :, 0 + ] # dim : batch_size x beam_width + + if top_paths == 1: + probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) # dim : batchsize + decoded_strings_pred = tf.squeeze(decoded_strings_pred, axis=1) + word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] + else: + probs = tf.math.exp(_log_prob) # dim : batchsize x beamwidth + word_values = [[word.decode() for word in words] for words in decoded_strings_pred.numpy().tolist()] return list(zip(word_values, probs.numpy().tolist())) @@ -364,19 +412,26 @@

Source code for doctr.models.recognition.crnn.tensorflow

Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of units in the LSTM layers + exportable: onnx exportable returns only logits + beam_width: beam width for beam search decoding + top_paths: number of top paths for beam search decoding cfg: configuration dictionary """ - _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "decoder", "postprocessor"] def __init__( self, - feature_extractor: tf.keras.Model, + feature_extractor: Model, vocab: str, rnn_units: int = 128, + exportable: bool = False, + beam_width: int = 1, + top_paths: int = 1, cfg: Optional[Dict[str, Any]] = None, ) -> None: # Initialize kernels @@ -386,19 +441,21 @@

Source code for doctr.models.recognition.crnn.tensorflow

self.vocab = vocab self.max_length = w self.cfg = cfg + self.exportable = exportable self.feat_extractor = feature_extractor - self.decoder = Sequential( - [ - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Dense(units=len(vocab) + 1) - ] - ) + self.decoder = Sequential([ + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Dense(units=len(vocab) + 1), + ]) self.decoder.build(input_shape=(None, w, h * c)) self.postprocessor = CTCPostProcessor(vocab=vocab) + self.beam_width = beam_width + self.top_paths = top_paths + def compute_loss( self, model_output: tf.Tensor, @@ -407,16 +464,17 @@

Source code for doctr.models.recognition.crnn.tensorflow

"""Compute CTC loss for the model. Args: - gt: the encoded tensor with gt labels + ---- model_output: predicted logits of the model - seq_len: lengths of each gt word inside the batch + target: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) batch_len = model_output.shape[0] - input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) + input_length = tf.fill((batch_len,), model_output.shape[1]) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -428,8 +486,12 @@

Source code for doctr.models.recognition.crnn.tensorflow

target: Optional[List[str]] = None, return_model_output: bool = False, return_preds: bool = False, + beam_width: int = 1, + top_paths: int = 1, **kwargs: Any, ) -> Dict[str, Any]: + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") features = self.feat_extractor(x, **kwargs) # B x H x W x C --> B x W x H x C @@ -437,91 +499,132 @@

Source code for doctr.models.recognition.crnn.tensorflow

w, h, c = transposed_feat.get_shape().as_list()[1:] # B x W x H x C --> B x W x H * C features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c)) - logits = self.decoder(features_seq, **kwargs) + logits = _bf16_to_float32(self.decoder(features_seq, **kwargs)) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = logits + return out + if return_model_output: out["out_map"] = logits if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(logits) + out["preds"] = self.postprocessor(logits, beam_width=beam_width, top_paths=top_paths) if target is not None: - out['loss'] = self.compute_loss(logits, target) + out["loss"] = self.compute_loss(logits, target) return out -def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: +def _crnn( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> CRNN: + pretrained_backbone = pretrained_backbone and not pretrained + + kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"]) - # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) + _cfg["vocab"] = kwargs["vocab"] + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] - # Feature extractor - feat_extractor = backbones.__dict__[_cfg['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + input_shape=_cfg["input_shape"], include_top=False, + pretrained=pretrained_backbone, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params(model, _cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]) return model
-[docs] +[docs] def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_vgg16_bn + >>> model = crnn_vgg16_bn(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_vgg16_bn", pretrained, vgg16_bn_r, **kwargs)
+ + + +
+[docs] +def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based + Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_small + >>> model = crnn_mobilenet_v3_small(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
+ Returns: + ------- + text recognition architecture + """ + return _crnn("crnn_mobilenet_v3_small", pretrained, mobilenet_v3_small_r, **kwargs)
-def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based +
+[docs] +def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_large + >>> model = crnn_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_mobilenet_v3_large", pretrained, mobilenet_v3_large_r, **kwargs)
- return _crnn('crnn_resnet31', pretrained, **kwargs)
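Editor's note: the rewritten CTCPostProcessor above exposes beam-search decoding through beam_width and top_paths, which the model's call forwards to it. A minimal sketch, assuming the TensorFlow backend; with top_paths > 1 each crop yields a list of candidate strings with their probabilities.

import tensorflow as tf
from doctr.models import crnn_vgg16_bn

model = crnn_vgg16_bn(pretrained=True)
crops = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
# top_paths must not exceed beam_width (see the post-processor docstring above)
out = model(crops, return_preds=True, beam_width=3, top_paths=3)
predictions = out["preds"]  # e.g. [(["best", "2nd", "3rd"], [p1, p2, p3])]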
@@ -554,8 +657,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

- +
+ diff --git a/v0.3.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/master/tensorflow.html index 2dc5a27717..aa6aa69325 100644 --- a/v0.3.1/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.master.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import tensorflow as tf
-from tensorflow.keras import layers, Sequential, Model
-from typing import Tuple, List, Dict, Any, Optional
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
 
-from ..core import RecognitionPostProcessor
-from ...backbones.resnet import ResnetStage
-from ...utils import conv_sequence, load_pretrained_params
-from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
-from ....datasets import VOCABS
-from .base import _MASTER, _MASTERPostProcessor
+import tensorflow as tf
+from tensorflow.keras import Model, layers
+
+from doctr.datasets import VOCABS
+from doctr.models.classification import magc_resnet31
+from doctr.models.modules.transformer import Decoder, PositionalEncoding
 
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from .base import _MASTER, _MASTERPostProcessor
 
-__all__ = ['MASTER', 'master', 'MASTERPostProcessor']
+__all__ = ["MASTER", "master"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'master': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'input_shape': (48, 160, 3),
-        'vocab': VOCABS['french'],
-        'url': None,
+    "master": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
     },
 }
 
 
-class MAGC(layers.Layer):
-
-    """Implements the Multi-Aspect Global Context Attention, as described in
-    <https://arxiv.org/pdf/1910.02562.pdf>`_.
-
-    Args:
-        inplanes: input channels
-        headers: number of headers to split channels
-        att_scale: if True, re-scale attention to counteract the variance distibutions
-        **kwargs
-    """
-
-    def __init__(
-        self,
-        inplanes: int,
-        headers: int = 1,
-        att_scale: bool = False,
-        **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-
-        self.headers = headers  # h
-        self.inplanes = inplanes  # C
-        self.att_scale = att_scale
-
-        self.single_header_inplanes = int(inplanes / headers)  # C / h
-
-        self.conv_mask = tf.keras.layers.Conv2D(
-            filters=1,
-            kernel_size=1,
-            kernel_initializer=tf.initializers.he_normal()
-        )
-
-        self.transform = tf.keras.Sequential(
-            [
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-                tf.keras.layers.LayerNormalization([1, 2, 3]),
-                tf.keras.layers.ReLU(),
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-            ],
-            name='transform'
-        )
-
-    @tf.function
-    def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
-        b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
-
-        # B, H, W, C -->> B*h, H, W, C/h
-        x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes))
-        x = tf.transpose(x, perm=(0, 3, 1, 2, 4))
-        x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes))
-
-        # Compute shorcut
-        shortcut = x
-        # B*h, 1, H*W, C/h
-        shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes))
-        # B*h, 1, C/h, H*W
-        shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2])
-
-        # Compute context mask
-        # B*h, H, W, 1,
-        context_mask = self.conv_mask(x)
-        # B*h, 1, H*W, 1
-        context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
-        # scale variance
-        if self.att_scale and self.headers > 1:
-            context_mask = context_mask / tf.sqrt(self.single_header_inplanes)
-        # B*h, 1, H*W, 1
-        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
-
-        # Compute context
-        # B*h, 1, C/h, 1
-        context = tf.matmul(shortcut, context_mask)
-        context = tf.reshape(context, shape=(b, 1, c, 1))
-        # B, 1, 1, C
-        context = tf.transpose(context, perm=(0, 1, 3, 2))
-        # Set shape to resolve shape when calling this module in the Sequential MAGCResnet
-        batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1]
-        context.set_shape([batch, 1, 1, chan])
-        return context
-
-    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
-        # Context modeling: B, H, W, C  ->  B, 1, 1, C
-        context = self.context_modeling(inputs)
-        # Transform: B, 1, 1, C  ->  B, 1, 1, C
-        transformed = self.transform(context)
-        return inputs + transformed
-
-
-class MAGCResnet(Sequential):
-
-    """Implements the modified resnet with MAGC layers, as described in paper.
-
-    Args:
-        headers: number of header to split channels in MAGC layers
-        input_shape: shape of the model input (without batch dim)
-    """
-
-    def __init__(
-        self,
-        headers: int = 1,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
-    ) -> None:
-        _layers = [
-            # conv_1x
-            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
-            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_2x
-            ResnetStage(num_blocks=1, output_channels=256),
-            MAGC(inplanes=256, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_3x
-            ResnetStage(num_blocks=2, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 1), (2, 1)),
-            # conv_4x
-            ResnetStage(num_blocks=5, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            # conv_5x
-            ResnetStage(num_blocks=3, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-        ]
-        super().__init__(_layers)
-
-
 class MASTER(_MASTER, Model):
-
     """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
     Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
 
     Args:
+    ----
+        feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary, (without EOS, SOS, PAD)
         d_model: d parameter for the transformer decoder
-        headers: headers for the MAGC module
         dff: depth of the pointwise feed-forward layer
         num_heads: number of heads for the mutli-head attention module
         num_layers: number of decoder layers to stack
         max_length: maximum length of character sequence handled by the model
-        input_size: size of the image inputs
+        dropout: dropout probability of the decoder
+        input_shape: size of the image inputs
+        exportable: onnx exportable returns only logits
+        cfg: dictionary containing information about the model
     """
 
     def __init__(
         self,
+        feature_extractor: Model,
         vocab: str,
         d_model: int = 512,
-        headers: int = 1,
         dff: int = 2048,
-        num_heads: int = 8,
+        num_heads: int = 8,  # number of heads in the transformer decoder
         num_layers: int = 3,
         max_length: int = 50,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
+        dropout: float = 0.2,
+        input_shape: Tuple[int, int, int] = (32, 128, 3),  # different from the paper
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
 
-        self.vocab = vocab
+        self.exportable = exportable
         self.max_length = max_length
+        self.d_model = d_model
+        self.vocab = vocab
         self.cfg = cfg
         self.vocab_size = len(vocab)
 
-        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
-        self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS
+        self.feat_extractor = feature_extractor
+        self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])
 
         self.decoder = Decoder(
             num_layers=num_layers,
-            d_model=d_model,
+            d_model=self.d_model,
             num_heads=num_heads,
+            vocab_size=self.vocab_size + 3,  # EOS, SOS, PAD
             dff=dff,
-            vocab_size=self.vocab_size,
-            maximum_position_encoding=max_length,
+            dropout=dropout,
+            maximum_position_encoding=self.max_length,
         )
-        self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
-        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
 
+        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
 
     @tf.function
-    def make_mask(self, target: tf.Tensor) -> tf.Tensor:
-        look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
-        target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
-        combined_mask = tf.maximum(target_padding_mask, look_ahead_mask)
-        return combined_mask
+    def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
+        # (N, 1, 1, max_length)
+        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
+        target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
+        target_length = target.shape[1]
+        # sub mask filled diagonal with 1 = see 0 = masked (max_length, max_length)
+        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
+        # source mask filled with ones (max_length, positional_encoded_seq_len)
+        source_mask = tf.ones((target_length, source.shape[1]))
+        # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
+        target_mask = tf.math.logical_and(
+            tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
+        )
+        return source_mask, target_mask
 
+    @staticmethod
     def compute_loss(
-        self,
         model_output: tf.Tensor,
         gt: tf.Tensor,
         seq_len: List[int],
@@ -512,11 +413,13 @@ 

Source code for doctr.models.recognition.master.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -532,7 +435,7 @@

Source code for doctr.models.recognition.master.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) @@ -547,94 +450,103 @@

Source code for doctr.models.recognition.master.tensorflow

"""Call function for training Args: + ---- x: images target: list of str labels return_model_output: if True, return logits return_preds: if True, decode logits + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A dictionnary containing eventually loss, logits and predictions. """ - # Encode - feature = self.feature_extractor(x, **kwargs) - b, h, w, c = (tf.shape(feature)[i] for i in range(4)) + feature = self.feat_extractor(x, **kwargs) + b, h, w, c = feature.get_shape() + # (N, H, W, C) --> (N, H * W, C) feature = tf.reshape(feature, shape=(b, h * w, c)) - encoded = feature + self.feature_pe[:, :h * w, :] + # add positional encoding to features + encoded = self.positional_encoding(feature, **kwargs) out: Dict[str, tf.Tensor] = {} + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") + if target is not None: # Compute target: tensor of gts and sequence lengths - gt, seq_len = self.compute_target(target) - - if kwargs.get('training', False): - if target is None: - raise AssertionError("In training mode, you need to pass a value to 'target'") - tgt_mask = self.make_mask(gt) + gt, seq_len = self.build_target(target) + # Compute decoder masks + source_mask, target_mask = self.make_source_and_target_mask(encoded, gt) # Compute logits - output = self.decoder(gt, encoded, tgt_mask, None, **kwargs) + output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) - else: - # When not training, we want to compute logits in with the decoder, although - # we have access to gts (we need gts to compute the loss, but not in the decoder) logits = self.decode(encoded, **kwargs) + logits = _bf16_to_float32(logits) + + if self.exportable: + out["logits"] = logits + return out + if target is not None: - out['loss'] = self.compute_loss(logits, gt, seq_len) + out["loss"] = self.compute_loss(logits, gt, seq_len) if return_model_output: - out['out_map'] = logits + out["out_map"] = logits if return_preds: - predictions = self.postprocessor(logits) - out['preds'] = predictions + out["preds"] = self.postprocessor(logits) return out + @tf.function def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor: """Decode function for prediction Args: + ---- encoded: encoded features + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A Tuple of tf.Tensor: predictions, logits """ - b = tf.shape(encoded)[0] - max_len = tf.constant(self.max_length, dtype=tf.int32) + b = encoded.shape[0] + start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32) # SOS padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32) # PAD - ys = tf.fill(dims=(b, max_len - 1), value=padding_symbol) + ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol) start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols - # max_len = len + 2 (sos + eos) + # Final dimension include EOS/SOS/PAD for i in range(self.max_length - 1): - ys_mask = self.make_mask(ys) - output = self.decoder(ys, encoded, ys_mask, None, **kwargs) + source_mask, target_mask = self.make_source_and_target_mask(encoded, ys) + output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) prob = tf.nn.softmax(logits, axis=-1) - next_word = tf.argmax(prob, axis=-1, output_type=ys.dtype) - # ys.shape = B, T - 
i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(max_len), indexing='ij') + next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype) + # update ys with the next token and ignore the first token (SOS) + i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij") indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1) - ys = tf.tensor_scatter_nd_update(ys, indices, next_word[:, i + 1]) + ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i]) - # final_logits of shape (N, max_length - 1, vocab_size + 1) (whithout sos) + # Shape (N, max_length, vocab_size + 1) return logits class MASTERPostProcessor(_MASTERPostProcessor): """Post processor for MASTER architectures + Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -649,51 +561,66 @@

Source code for doctr.models.recognition.master.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER: +def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"]) + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) - kwargs['vocab'] = _cfg['vocab'] + kwargs["vocab"] = _cfg["vocab"] + kwargs["input_shape"] = _cfg["input_shape"] # Build the model - model = MASTER(cfg=_cfg, **kwargs) + model = MASTER( + backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False), + cfg=_cfg, + **kwargs, + ) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model
-[docs] +[docs] def master(pretrained: bool = False, **kwargs: Any) -> MASTER: """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import master - >>> model = master(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + + >>> import tensorflow as tf + >>> from doctr.models import master + >>> model = master(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keywoard arguments passed to the MASTER architecture + Returns: + ------- text recognition architecture """ - - return _master('master', pretrained, **kwargs)
+ return _master("master", pretrained, magc_resnet31, **kwargs)
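Editor's note: _master above now loads pretrained weights with skip_mismatch whenever the requested vocab differs from the default, so the output layers can be fine-tuned on another charset. A hedged sketch; the vocab strings are illustrative and VOCABS comes from doctr.datasets as in the hunk above.

from doctr.datasets import VOCABS
from doctr.models import master

# default vocab: all pretrained weights are loaded as-is
model = master(pretrained=True, vocab=VOCABS["french"])
# custom charset: mismatching head layers are skipped and left to fine-tune
digits_model = master(pretrained=True, vocab="0123456789")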
@@ -727,8 +654,8 @@

Source code for doctr.models.recognition.master.tensorflow

- +
+ diff --git a/v0.3.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.3.1/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.3.1/_modules/doctr/models/recognition/sar.html b/v0.3.1/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.3.1/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.sar - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H * W) -> (N, 1)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, 1)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + 1) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- -
- \ No newline at end of file diff --git a/v0.3.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/sar/tensorflow.html index e514e4f0c4..4a591e6451 100644 --- a/v0.3.1/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.sar.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
+
 import tensorflow as tf
-from tensorflow.keras import Sequential, layers, Model
-from typing import Tuple, Dict, List, Any, Optional
+from tensorflow.keras import Model, Sequential, layers
 
-from ... import backbones
-from ...utils import load_pretrained_params
-from ..core import RecognitionModel, RecognitionPostProcessor
+from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
 
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
+from ...classification import resnet31
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from ..core import RecognitionModel, RecognitionPostProcessor
+
+__all__ = ["SAR", "sar_resnet31"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
+    "sar_resnet31": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/sar_resnet31-5a58806c.weights.h5&src=0",
     },
 }
 
 
+class SAREncoder(layers.Layer, NestedObject):
+    """Implements encoder module of the SAR model
+
+    Args:
+    ----
+        rnn_units: number of hidden rnn units
+        dropout_prob: dropout probability
+    """
+
+    def __init__(self, rnn_units: int, dropout_prob: float = 0.0) -> None:
+        super().__init__()
+        self.rnn = Sequential([
+            layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
+            layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
+        ])
+
+    def call(
+        self,
+        x: tf.Tensor,
+        **kwargs: Any,
+    ) -> tf.Tensor:
+        # (N, C)
+        return self.rnn(x, **kwargs)
+
+
 class AttentionModule(layers.Layer, NestedObject):
     """Implements attention module of the SAR model
 
     Args:
+    ----
         attention_units: number of hidden attention units
 
     """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
 
+    def __init__(self, attention_units: int) -> None:
         super().__init__()
         self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            3,
+            strides=1,
+            use_bias=True,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
+            1,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.flatten = layers.Flatten()
 
@@ -343,12 +395,12 @@ 

Source code for doctr.models.recognition.sar.tensorflow

hidden_state: tf.Tensor, **kwargs: Any, ) -> tf.Tensor: - [H, W] = features.get_shape().as_list()[1:3] - # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) - hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) # shape (N, H, W, vgg_units) -> (N, H, W, attention_units) features_projection = self.features_projector(features, **kwargs) + # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) + hidden_state = tf.expand_dims(tf.expand_dims(hidden_state, axis=1), axis=1) + hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) projection = tf.math.tanh(hidden_state_projection + features_projection) # shape (N, H, W, attention_units) -> (N, H, W, 1) attention = self.attention_projector(projection, **kwargs) @@ -358,23 +410,25 @@

Source code for doctr.models.recognition.sar.tensorflow

# shape (N, H * W) -> (N, H, W, 1) attention_map = tf.reshape(attention, [-1, H, W, 1]) glimpse = tf.math.multiply(features, attention_map) - # shape (N, H * W) -> (N, 1) - glimpse = tf.reduce_sum(glimpse, axis=[1, 2]) - return glimpse + # shape (N, H * W) -> (N, C) + return tf.reduce_sum(glimpse, axis=[1, 2]) class SARDecoder(layers.Layer, NestedObject): """Implements decoder module of the SAR model Args: + ---- rnn_units: number of hidden units in recurrent cells max_length: maximum length of a sequence vocab_size: number of classes in the model alphabet embedding_units: number of hidden embedding units attention_units: number of hidden attention units - num_decoder_layers: number of LSTM layers to stack + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability """ + def __init__( self, rnn_units: int, @@ -382,23 +436,22 @@

Source code for doctr.models.recognition.sar.tensorflow

vocab_size: int, embedding_units: int, attention_units: int, - num_decoder_layers: int = 2, - input_shape: Optional[List[Tuple[Optional[int]]]] = None, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, ) -> None: - super().__init__() self.vocab_size = vocab_size - self.lstm_decoder = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] - ) - self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) - self.attention_module = AttentionModule(attention_units) - self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units)) self.max_length = max_length - # Initialize kernels - if input_shape is not None: - self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units))) + self.embed = layers.Dense(embedding_units, use_bias=False) + self.embed_tgt = layers.Embedding(embedding_units, self.vocab_size + 1) + + self.lstm_cells = layers.StackedRNNCells([ + layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells) + ]) + self.attention_module = AttentionModule(attention_units) + self.output_dense = layers.Dense(self.vocab_size + 1, use_bias=True) + self.dropout = layers.Dropout(dropout_prob) def call( self, @@ -407,40 +460,47 @@

Source code for doctr.models.recognition.sar.tensorflow

gt: Optional[tf.Tensor] = None, **kwargs: Any, ) -> tf.Tensor: - - # initialize states (each of shape (N, rnn_units)) - states = self.lstm_decoder.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=tf.float32 - ) - # run first step of lstm - # holistic: shape (N, rnn_units) - _, states = self.lstm_decoder(holistic, states, **kwargs) - # Initialize with the index of virtual START symbol (placed after <eos>) - symbol = tf.fill(features.shape[0], self.vocab_size + 1) - logits_list = [] - if kwargs.get('training') and gt is None: - raise ValueError('Need to provide labels during training for teacher forcing') - for t in range(self.max_length + 1): # keep 1 step for <eos> - # one-hot symbol with depth vocab_size + 1 - # embeded_symbol: shape (N, embedding_units) - embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs) - logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs) - glimpse = self.attention_module( - features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs, - ) - # logits: shape (N, rnn_units), glimpse: shape (N, 1) - logits = tf.concat([logits, glimpse], axis=-1) - # shape (N, rnn_units + 1) -> (N, vocab_size + 1) - logits = self.output_dense(logits, **kwargs) - # update symbol with predicted logits for t+1 step - if kwargs.get('training'): - symbol = gt[:, t] # type: ignore[index] + if gt is not None: + gt_embedding = self.embed_tgt(gt, **kwargs) + + logits_list: List[tf.Tensor] = [] + + for t in range(self.max_length + 1): # 32 + if t == 0: + # step to init the first states of the LSTMCell + states = self.lstm_cells.get_initial_state( + inputs=None, batch_size=features.shape[0], dtype=features.dtype + ) + prev_symbol = holistic + elif t == 1: + # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros + # (N, vocab_size + 1) --> (N, embedding_units) + prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1], dtype=features.dtype) + prev_symbol = self.embed(prev_symbol, **kwargs) else: - symbol = tf.argmax(logits, axis=-1) - logits_list.append(logits) - outputs = tf.stack(logits_list, axis=1) # shape (N, max_length + 1, vocab_size + 1) - - return outputs + if gt is not None and kwargs.get("training", False): + # (N, embedding_units) -2 because of <bos> and <eos> (same) + prev_symbol = self.embed(gt_embedding[:, t - 2], **kwargs) + else: + # -1 to start at timestep where prev_symbol was initialized + index = tf.argmax(logits_list[t - 1], axis=-1) + # update prev_symbol with ones at the index of the previous logit vector + prev_symbol = self.embed(self.embed_tgt(index, **kwargs), **kwargs) + + # (N, C), (N, C) take the last hidden state and cell state from current timestep + _, states = self.lstm_cells(prev_symbol, states, **kwargs) + # states = (hidden_state, cell_state) + hidden_state = states[0][0] + # (N, H, W, C), (N, C) --> (N, C) + glimpse = self.attention_module(features, hidden_state, **kwargs) + # (N, C), (N, C) --> (N, 2 * C) + logits = tf.concat([hidden_state, glimpse], axis=1) + logits = self.dropout(logits, **kwargs) + # (N, vocab_size + 1) + logits_list.append(self.output_dense(logits, **kwargs)) + + # (max_length + 1, N, vocab_size + 1) --> (N, max_length + 1, vocab_size + 1) + return tf.transpose(tf.stack(logits_list[1:]), (1, 0, 2)) class SAR(Model, RecognitionModel): @@ -448,17 +508,20 @@

Source code for doctr.models.recognition.sar.tensorflow

Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of hidden units in both encoder and decoder LSTM embedding_units: number of embedding units attention_units: number of hidden units in attention module max_length: maximum word length handled by the model - num_decoders: number of LSTM to stack in decoder layer - + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability for the encoder and decoder + exportable: onnx exportable returns only logits + cfg: dictionary containing information about the model """ - _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"] def __init__( self, @@ -468,36 +531,34 @@

Source code for doctr.models.recognition.sar.tensorflow

embedding_units: int = 512, attention_units: int = 512, max_length: int = 30, - num_decoders: int = 2, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: - super().__init__() self.vocab = vocab + self.exportable = exportable self.cfg = cfg - self.max_length = max_length + 1 # Add 1 timestep for EOS after the longest word self.feat_extractor = feature_extractor - self.encoder = Sequential( - [ - layers.LSTM(units=rnn_units, return_sequences=True), - layers.LSTM(units=rnn_units, return_sequences=False) - ] - ) - # Initialize the kernels (watch out for reduce_max) - self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:]) - + self.encoder = SAREncoder(rnn_units, dropout_prob) self.decoder = SARDecoder( - rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders, - input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape] + rnn_units, + self.max_length, + len(vocab), + embedding_units, + attention_units, + num_decoder_cells, + dropout_prob, ) self.postprocessor = SARPostProcessor(vocab=vocab) + @staticmethod def compute_loss( - self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: tf.Tensor, @@ -506,11 +567,13 @@

Source code for doctr.models.recognition.sar.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -525,7 +588,7 @@

Source code for doctr.models.recognition.sar.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len) masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) def call( @@ -536,16 +599,28 @@

Source code for doctr.models.recognition.sar.tensorflow

return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - features = self.feat_extractor(x, **kwargs) - pooled_features = tf.reduce_max(features, axis=1) # vertical max pooling + # vertical max pooling --> (N, C, W) + pooled_features = tf.reduce_max(features, axis=1) + # holistic (N, C) encoded = self.encoder(pooled_features, **kwargs) + if target is not None: - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) seq_len = tf.cast(seq_len, tf.int32) - decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training for teacher forcing") + + decoded_features = _bf16_to_float32( + self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + ) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = decoded_features + return out + if return_model_output: out["out_map"] = decoded_features @@ -554,7 +629,7 @@

Source code for doctr.models.recognition.sar.tensorflow

out["preds"] = self.postprocessor(decoded_features) if target is not None: - out['loss'] = self.compute_loss(decoded_features, gt, seq_len) + out["loss"] = self.compute_loss(decoded_features, gt, seq_len) return out @@ -563,9 +638,8 @@

Source code for doctr.models.recognition.sar.tensorflow

"""Post processor for SAR architectures Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -580,95 +654,75 @@

Source code for doctr.models.recognition.sar.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: +def _sar( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> SAR: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units']) - _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units']) - _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length']) - _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) # Feature extractor - feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + pretrained=pretrained_backbone, + input_shape=_cfg["input_shape"], include_top=False, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - kwargs['embedding_units'] = _cfg['embedding_units'] - kwargs['attention_units'] = _cfg['attention_units'] - kwargs['max_length'] = _cfg['max_length'] - kwargs['num_decoders'] = _cfg['num_decoders'] + kwargs["vocab"] = _cfg["vocab"] # Build the model model = SAR(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model -
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - -
-[docs] +[docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import sar_resnet31 + >>> model = sar_resnet31(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the SAR architecture Returns: + ------- text recognition architecture """ - - return _sar('sar_resnet31', pretrained, **kwargs)
+ return _sar("sar_resnet31", pretrained, resnet31, **kwargs)
@@ -702,8 +756,8 @@

Source code for doctr.models.recognition.sar.tensorflow

- +
+ diff --git a/v0.3.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.3.1/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.3.1/_modules/doctr/models/recognition/zoo.html b/v0.3.1/_modules/doctr/models/recognition/zoo.html index bf0ae6af6e..f664304019 100644 --- a/v0.3.1/_modules/doctr/models/recognition/zoo.html +++ b/v0.3.1/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
-from doctr.file_utils import is_tf_available, is_torch_available
-from .core import RecognitionPredictor
-from ..preprocessor import PreProcessor
-from .. import recognition
+from doctr.file_utils import is_tf_available
+from doctr.models.preprocessor import PreProcessor
 
+from .. import recognition
+from .predictor import RecognitionPredictor
 
 __all__ = ["recognition_predictor"]
 
 
-if is_tf_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
-elif is_torch_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
+ARCHS: List[str] = [
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "sar_resnet31",
+    "master",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
+]
+
 
+def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+        _model = recognition.__dict__[arch](
+            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
+        )
+    else:
+        if not isinstance(
+            arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+        ):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+    kwargs.pop("pretrained_backbone", None)
 
-    _model = recognition.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 32)
-    predictor = RecognitionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
-        _model
-    )
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
+    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
+    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
 
     return predictor
 
 
 
-[docs] -def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor: +[docs] +def recognition_predictor( + arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + symmetric_pad: bool = False, + batch_size: int = 128, + **kwargs: Any, +) -> RecognitionPredictor: """Text recognition architecture. Example:: @@ -326,14 +369,18 @@

Source code for doctr.models.recognition.zoo

        >>> out = model([input_page])
 
     Args:
-        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
+    ----
+        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
+        **kwargs: optional parameters to be passed to the architecture
 
     Returns:
+    -------
         Recognition predictor
     """
-
-    return _predictor(arch, pretrained, **kwargs)
+ return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
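Because _predictor now accepts either an architecture name or an already-instantiated recognition model, a custom-configured model can be wrapped in a predictor directly; the mean/std and input shape are then read from its cfg. A hedged sketch of that path (the vocab string is illustrative):

>>> import numpy as np
>>> from doctr.models import recognition_predictor, sar_resnet31
>>> reco = sar_resnet31(pretrained=False, vocab="0123456789")
>>> predictor = recognition_predictor(arch=reco, batch_size=32)
>>> crop = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
>>> out = predictor([crop])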
@@ -367,8 +414,8 @@

Source code for doctr.models.recognition.zoo

   
-
- +
+ diff --git a/v0.3.1/_modules/doctr/models/zoo.html b/v0.3.1/_modules/doctr/models/zoo.html index dec6857019..d459671648 100644 --- a/v0.3.1/_modules/doctr/models/zoo.html +++ b/v0.3.1/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from typing import Any
-from .core import OCRPredictor
+
 from .detection.zoo import detection_predictor
+from .kie_predictor import KIEPredictor
+from .predictor import OCRPredictor
 from .recognition.zoo import recognition_predictor
 
+__all__ = ["ocr_predictor", "kie_predictor"]
 
-__all__ = ["ocr_predictor"]
-
-
-def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
 
+def _predictor(
+    det_arch: Any,
+    reco_arch: Any,
+    pretrained: bool,
+    pretrained_backbone: bool = True,
+    assume_straight_pages: bool = True,
+    preserve_aspect_ratio: bool = True,
+    symmetric_pad: bool = True,
+    det_bs: int = 2,
+    reco_bs: int = 128,
+    detect_orientation: bool = False,
+    straighten_pages: bool = False,
+    detect_language: bool = False,
+    **kwargs,
+) -> OCRPredictor:
     # Detection
-    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
+    det_predictor = detection_predictor(
+        det_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=det_bs,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+    )
 
     # Recognition
-    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
+    reco_predictor = recognition_predictor(
+        reco_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=reco_bs,
+    )
 
-    return OCRPredictor(det_predictor, reco_predictor)
+    return OCRPredictor(
+        det_predictor,
+        reco_predictor,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+        detect_orientation=detect_orientation,
+        straighten_pages=straighten_pages,
+        detect_language=detect_language,
+        **kwargs,
+    )
 
 
 
-[docs] +[docs] def ocr_predictor( - det_arch: str = 'db_resnet50', - reco_arch: str = 'crnn_vgg16_bn', + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", pretrained: bool = False, - **kwargs: Any + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - Example:: - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` Returns: + ------- OCR predictor """ + return _predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
+ + - return _predictor(det_arch, reco_arch, pretrained, **kwargs)
+def _kie_predictor( + det_arch: Any, + reco_arch: Any, + pretrained: bool, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + det_bs: int = 2, + reco_bs: int = 128, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs, +) -> KIEPredictor: + # Detection + det_predictor = detection_predictor( + det_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=det_bs, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + ) + + # Recognition + reco_predictor = recognition_predictor( + reco_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=reco_bs, + ) + + return KIEPredictor( + det_predictor, + reco_predictor, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + ) + + +
+[docs] +def kie_predictor( + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, +) -> KIEPredictor: + """End-to-end KIE architecture using one model for localization, and another for text recognition. + + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` + + Returns: + ------- + KIE predictor + """ + return _kie_predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
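Both end-to-end predictors expose the same construction flags, so handling rotated documents is only a matter of which toggles are set; kie_predictor is built exactly like ocr_predictor but returns a KIEPredictor. A sketch combining the orientation-related options documented above (weights are downloaded when pretrained=True):

>>> import numpy as np
>>> from doctr.models import ocr_predictor, kie_predictor
>>> model = ocr_predictor(
...     det_arch="fast_base",
...     reco_arch="crnn_vgg16_bn",
...     pretrained=True,
...     assume_straight_pages=False,
...     straighten_pages=True,
...     detect_orientation=True,
... )
>>> page = (255 * np.random.rand(1024, 768, 3)).astype(np.uint8)
>>> result = model([page])
>>> kie_model = kie_predictor(pretrained=True)  # accepts the same keyword arguments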
@@ -353,8 +575,8 @@

Source code for doctr.models.zoo

       
     
   
- - + + diff --git a/v0.3.1/_modules/doctr/transforms/modules.html b/v0.3.1/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.3.1/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - doctr.transforms.modules - docTR documentation

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/_modules/doctr/transforms/modules/base.html b/v0.3.1/_modules/doctr/transforms/modules/base.html index c42079a8fd..4596df3848 100644 --- a/v0.3.1/_modules/doctr/transforms/modules/base.html +++ b/v0.3.1/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.base

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+import math
 import random
-from typing import List, Any, Callable
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import numpy as np
 
 from doctr.utils.repr import NestedObject
+
 from .. import functional as F
 
+__all__ = ["SampleCompose", "ImageTransform", "ColorInversion", "OneOf", "RandomApply", "RandomRotate", "RandomCrop"]
+
+
+class SampleCompose(NestedObject):
+    """Implements a wrapper that will apply transformations sequentially on both image and target
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfo = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(30)])
+                >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4)))
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import torch
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfos = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(30)])
+                >>> out, out_boxes = transfos(torch.rand(8, 64, 64, 3), np.zeros((2, 4)))
+
+    Args:
+    ----
+        transforms: list of transformation modules
+    """
+
+    _children_names: List[str] = ["sample_transforms"]
+
+    def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None:
+        self.sample_transforms = transforms
+
+    def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]:
+        for t in self.sample_transforms:
+            x, target = t(x, target)
+
+        return x, target
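A minimal usage sketch (editorial illustration, not part of the library source; it assumes the TensorFlow backend is installed and reuses ColorInversion and RandomRotate defined in this module):
>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
>>> # the image-only transform is wrapped so the (image, target) pair flows through the whole chain
>>> transfo = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(max_angle=10)])
>>> polys = np.array([[[0.1, 0.1], [0.4, 0.1], [0.4, 0.4], [0.1, 0.4]]], dtype=np.float32)
>>> img, boxes = transfo(tf.random.uniform((64, 64, 3)), polys)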
+
+
+class ImageTransform(NestedObject):
+    """Implements a transform wrapper to turn an image-only transformation into an image+target transform
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion(min_val=0.6))
+                >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None)
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import torch
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion(min_val=0.6))
+                >>> out, _ = transfo(torch.rand(8, 64, 64, 3), None)
+
+    Args:
+    ----
+        transform: the image transformation module to wrap
+    """
+
+    _children_names: List[str] = ["img_transform"]
+
+    def __init__(self, transform: Callable[[Any], Any]) -> None:
+        self.img_transform = transform
 
-__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
+    def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]:
+        img = self.img_transform(img)
+        return img, target
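A short sketch (editorial illustration, TensorFlow backend assumed): the wrapped transform only modifies the image, and the target is returned untouched.
>>> import tensorflow as tf
>>> from doctr.transforms import ImageTransform, ColorInversion
>>> transfo = ImageTransform(ColorInversion(min_val=0.6))
>>> img, target = transfo(tf.random.uniform((64, 64, 3)), "unchanged-target")
>>> target
'unchanged-target'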
 
 
 
-[docs] +[docs] class ColorInversion(NestedObject): """Applies the following transformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(torch.rand(8, 64, 64, 3)) Args: + ---- min_val: range [min_val, 1] to colorize RGB pixels """ + def __init__(self, min_val: float = 0.5) -> None: self.min_val = min_val @@ -316,59 +437,178 @@

Source code for doctr.transforms.modules.base

-[docs] +[docs] class OneOf(NestedObject): """Randomly apply one of the input transformations - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transforms: list of transformations, one only will be picked """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: # Pick transformation transfo = self.transforms[int(random.random() * len(self.transforms))] # Apply - return transfo(img)
+ return transfo(img) if target is None else transfo(img, target) # type: ignore[call-arg]
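A usage sketch (editorial illustration, TensorFlow backend assumed; RandomBrightness and RandomContrast come from the backend-specific module and reflect the current export names): exactly one transform from the list is drawn at each call.
>>> import tensorflow as tf
>>> from doctr.transforms import OneOf, RandomBrightness, RandomContrast
>>> transfo = OneOf([RandomBrightness(max_delta=0.2), RandomContrast(delta=0.2)])
>>> out = transfo(tf.random.uniform((64, 64, 3)))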
-[docs] +[docs] class RandomApply(NestedObject): """Apply with a probability p the input transformation - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transform: transformation to apply p: probability to apply """ - def __init__(self, transform: Callable[[Any], Any], p: float = .5) -> None: + + def __init__(self, transform: Callable[[Any], Any], p: float = 0.5) -> None: self.transform = transform self.p = p def extra_repr(self) -> str: return f"transform={self.transform}, p={self.p}" - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: if random.random() < self.p: - return self.transform(img) - return img
+ return self.transform(img) if target is None else self.transform(img, target) # type: ignore[call-arg] + return img if target is None else (img, target)
+ + + +
+[docs] +class RandomRotate(NestedObject): + """Randomly rotate a tensor image and its boxes + + .. image:: https://doctr-static.mindee.com/models?id=v0.4.0/rotation_illustration.png&src=0 + :align: center + + Args: + ---- + max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in + [-max_angle, max_angle] + expand: whether the image should be padded before the rotation + """ + + def __init__(self, max_angle: float = 5.0, expand: bool = False) -> None: + self.max_angle = max_angle + self.expand = expand + + def extra_repr(self) -> str: + return f"max_angle={self.max_angle}, expand={self.expand}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + angle = random.uniform(-self.max_angle, self.max_angle) + r_img, r_polys = F.rotate_sample(img, target, angle, self.expand) + # Removes deleted boxes + is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2 + return r_img, r_polys[is_kept]
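A usage sketch (editorial illustration, TensorFlow backend assumed): targets are relative polygons of shape (N, 4, 2), and boxes that collapse after the rotation are dropped from the output.
>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomRotate
>>> transfo = RandomRotate(max_angle=15, expand=True)
>>> polys = np.array([[[0.2, 0.2], [0.6, 0.2], [0.6, 0.5], [0.2, 0.5]]], dtype=np.float32)
>>> r_img, r_polys = transfo(tf.random.uniform((64, 64, 3)), polys)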
+ + + +
+[docs] +class RandomCrop(NestedObject): + """Randomly crop a tensor image and its boxes + + Args: + ---- + scale: tuple of floats, relative (min_area, max_area) of the crop + ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w + """ + + def __init__(self, scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: + self.scale = scale + self.ratio = ratio + + def extra_repr(self) -> str: + return f"scale={self.scale}, ratio={self.ratio}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + scale = random.uniform(self.scale[0], self.scale[1]) + ratio = random.uniform(self.ratio[0], self.ratio[1]) + + height, width = img.shape[:2] + + # Calculate crop size + crop_area = scale * width * height + aspect_ratio = ratio * (width / height) + crop_width = int(round(math.sqrt(crop_area * aspect_ratio))) + crop_height = int(round(math.sqrt(crop_area / aspect_ratio))) + + # Ensure crop size does not exceed image dimensions + crop_width = min(crop_width, width) + crop_height = min(crop_height, height) + + # Randomly select crop position + x = random.randint(0, width - crop_width) + y = random.randint(0, height - crop_height) + + # relative crop box + crop_box = (x / width, y / height, (x + crop_width) / width, (y + crop_height) / height) + if target.shape[1:] == (4, 2): + min_xy = np.min(target, axis=1) + max_xy = np.max(target, axis=1) + _target = np.concatenate((min_xy, max_xy), axis=1) + else: + _target = target + + # Crop image and targets + croped_img, crop_boxes = F.crop_detection(img, _target, crop_box) + # hard fallback if no box is kept + if crop_boxes.shape[0] == 0: + return img, target + # clip boxes + return croped_img, np.clip(crop_boxes, 0, 1)
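A usage sketch (editorial illustration, TensorFlow backend assumed): boxes are relative (xmin, ymin, xmax, ymax) coordinates, the returned boxes are clipped to [0, 1], and if no box survives the sampled crop the original pair is returned unchanged.
>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomCrop
>>> transfo = RandomCrop(scale=(0.5, 1.0), ratio=(0.75, 1.33))
>>> img, boxes = transfo(tf.random.uniform((64, 64, 3)), np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32))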
@@ -402,8 +642,8 @@

Source code for doctr.transforms.modules.base

- - + + diff --git a/v0.3.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.3.1/_modules/doctr/transforms/modules/tensorflow.html index 1d192a876b..acbbe96225 100644 --- a/v0.3.1/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.3.1/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
 import tensorflow as tf
-from typing import List, Any, Tuple, Callable
 
 from doctr.utils.repr import NestedObject
 
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
-           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
+from ..functional.tensorflow import _gaussian_filter, random_shadow
+
+__all__ = [
+    "Compose",
+    "Resize",
+    "Normalize",
+    "LambdaTransformation",
+    "ToGray",
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomSaturation",
+    "RandomHue",
+    "RandomGamma",
+    "RandomJpegQuality",
+    "GaussianBlur",
+    "ChannelShuffle",
+    "GaussianNoise",
+    "RandomHorizontalFlip",
+    "RandomShadow",
+    "RandomResize",
+]
 
 
 
-[docs] +[docs] class Compose(NestedObject): """Implements a wrapper that will apply transformations sequentially - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Compose, Resize + >>> transfos = Compose([Resize((32, 32))]) + >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- transforms: list of transformation modules """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms @@ -319,26 +361,27 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class Resize(NestedObject): """Resizes a tensor to a target size - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Resize + >>> transfo = Resize((32, 32)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- output_size: expected output size method: interpolation method preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically """ + def __init__( self, - output_size: Tuple[int, int], - method: str = 'bilinear', + output_size: Union[int, Tuple[int, int]], + method: str = "bilinear", preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, ) -> None: @@ -346,6 +389,14 @@

Source code for doctr.transforms.modules.tensorflow

self.method = method self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad + self.antialias = True + + if isinstance(self.output_size, int): + self.wanted_size = (self.output_size, self.output_size) + elif isinstance(self.output_size, (tuple, list)): + self.wanted_size = self.output_size + else: + raise AssertionError("Output size should be either a list, a tuple or an int") def extra_repr(self) -> str: _repr = f"output_size={self.output_size}, method='{self.method}'" @@ -353,64 +404,106 @@

Source code for doctr.transforms.modules.tensorflow

_repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" return _repr - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) + def __call__( + self, + img: tf.Tensor, + target: Optional[np.ndarray] = None, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: + input_dtype = img.dtype + self.output_size = ( + (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size + ) + + img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias) + # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio + raw_shape = img.shape[:2] + if self.symmetric_pad: + half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0) if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
+ if isinstance(self.output_size, (tuple, list)): + # In that case we need to pad because we want to enforce both width and height + if not self.symmetric_pad: + half_pad = (0, 0) + elif self.output_size[0] == img.shape[0]: + half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2)) + # Pad image + img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size) + + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: + if self.symmetric_pad: + offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1] + + if self.preserve_aspect_ratio: + # Get absolute coords + if target.shape[1:] == (4,): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1] + target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0] + else: + target[:, [0, 2]] *= raw_shape[1] / img.shape[1] + target[:, [1, 3]] *= raw_shape[0] / img.shape[0] + elif target.shape[1:] == (4, 2): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1] + target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0] + else: + target[..., 0] *= raw_shape[1] / img.shape[1] + target[..., 1] *= raw_shape[0] / img.shape[0] + else: + raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") + + return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1) + + return tf.cast(img, dtype=input_dtype)
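A sketch of both call modes (editorial illustration, TensorFlow backend assumed): image only, or image plus relative boxes that are rescaled and shifted to account for aspect-preserving padding.
>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import Resize
>>> transfo = Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
>>> img = transfo(tf.random.uniform((64, 128, 3)))
>>> img, boxes = transfo(tf.random.uniform((64, 128, 3)), np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32))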
-[docs] +[docs] class Normalize(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Normalize + >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- mean: average value per channel std: standard deviation per channel """ + def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) + self.mean = tf.constant(mean) + self.std = tf.constant(std) def extra_repr(self) -> str: return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std + img -= tf.cast(self.mean, dtype=img.dtype) + img /= tf.cast(self.std, dtype=img.dtype) return img
-[docs] +[docs] class LambdaTransformation(NestedObject): """Applies a custom (lambda) function to the input tensor - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import LambdaTransformation + >>> transfo = LambdaTransformation(lambda x: x/ 255.) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- fn: the function to be applied to the input tensor """ + def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: self.fn = fn @@ -420,37 +513,42 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class ToGray(NestedObject): """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import ToGray + >>> transfo = ToGray() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) """ + + def __init__(self, num_output_channels: int = 1): + self.num_output_channels = num_output_channels + def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
+ img = tf.image.rgb_to_grayscale(img) + return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
-[docs] +[docs] class RandomBrightness(NestedObject): """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomBrightness + >>> transfo = RandomBrightness() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] p: probability to apply transformation """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -463,21 +561,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomContrast(NestedObject): """Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomContrast + >>> transfo = RandomContrast() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) """ - def __init__(self, delta: float = .3) -> None: + + def __init__(self, delta: float = 0.3) -> None: self.delta = delta def extra_repr(self) -> str: @@ -489,21 +588,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomSaturation(NestedObject): """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomSaturation + >>> transfo = RandomSaturation() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) """ - def __init__(self, delta: float = .5) -> None: + + def __init__(self, delta: float = 0.5) -> None: self.delta = delta def extra_repr(self) -> str: @@ -515,19 +615,20 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomHue(NestedObject): """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHue + >>> transfo = RandomHue() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -540,22 +641,23 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomGamma(NestedObject): """randomly performs gamma correction for a tensor (batch of images or image) - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomGamma + >>> transfo = RandomGamma() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- min_gamma: non-negative real number, lower bound for gamma param max_gamma: non-negative real number, upper bound for gamma min_gain: lower bound for constant multiplier max_gain: upper bound for constant multiplier """ + def __init__( self, min_gamma: float = 0.5, @@ -580,20 +682,21 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomJpegQuality(NestedObject): """Randomly adjust jpeg quality of a 3 dimensional RGB image - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomJpegQuality + >>> transfo = RandomJpegQuality() + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- min_quality: int between [0, 100] max_quality: int between [0, 100] """ + def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: self.min_quality = min_quality self.max_quality = max_quality @@ -602,10 +705,224 @@

Source code for doctr.transforms.modules.tensorflow

return f"min_quality={self.min_quality}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality + return tf.image.random_jpeg_quality(img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality)
+ + + +
+[docs] +class GaussianBlur(NestedObject): + """Randomly blur a 3 dimensional RGB image with a Gaussian filter + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianBlur + >>> transfo = GaussianBlur(3, (.1, 5)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + kernel_shape: size of the blurring kernel + std: min and max value of the standard deviation + """ + + def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None: + self.kernel_shape = kernel_shape + self.std = std + + def extra_repr(self) -> str: + return f"kernel_shape={self.kernel_shape}, std={self.std}" + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.squeeze( + _gaussian_filter( + img[tf.newaxis, ...], + kernel_size=self.kernel_shape, + sigma=random.uniform(self.std[0], self.std[1]), + mode="REFLECT", + ), + axis=0, )
+ + +
+[docs] +class ChannelShuffle(NestedObject): + """Randomly shuffle channel order of a given image""" + + def __init__(self): + pass + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
+ + + +
+[docs] +class GaussianNoise(NestedObject): + """Adds Gaussian Noise to the input tensor + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianNoise + >>> transfo = GaussianNoise(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + mean : mean of the gaussian distribution + std : std of the gaussian distribution + """ + + def __init__(self, mean: float = 0.0, std: float = 1.0) -> None: + super().__init__() + self.std = std + self.mean = mean + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), dtype=tf.uint8 + ) + else: + return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype) + + def extra_repr(self) -> str: + return f"mean={self.mean}, std={self.std}"
+ + + +
+[docs] +class RandomHorizontalFlip(NestedObject): + """Adds random horizontal flip to the input tensor/np.ndarray + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHorizontalFlip + >>> transfo = RandomHorizontalFlip(p=0.5) + >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1) + >>> target = np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32) + >>> out = transfo(image, target) + + Args: + ---- + p : probability of Horizontal Flip + """ + + def __init__(self, p: float) -> None: + super().__init__() + self.p = p + + def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + _img = tf.image.flip_left_right(img) + _target = target.copy() + # Changing the relative bbox coordinates + if target.shape[1:] == (4,): + _target[:, ::2] = 1 - target[:, [2, 0]] + else: + _target[..., 0] = 1 - target[..., 0] + return _img, _target + return img, target
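A worked example of the box update (editorial illustration, TensorFlow backend assumed): with relative (xmin, ymin, xmax, ymax) boxes, the flip maps each x-coordinate to 1 - x, so [0.1, 0.1, 0.4, 0.5] becomes [0.6, 0.1, 0.9, 0.5] whenever the flip is applied.
>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomHorizontalFlip
>>> transfo = RandomHorizontalFlip(p=1.0)  # p=1 so the flip always fires
>>> _, boxes = transfo(tf.random.uniform((64, 64, 3)), np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32))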
+ + + +
+[docs] +class RandomShadow(NestedObject): + """Adds random shade to the input image + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomShadow + >>> transfo = RandomShadow(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + opacity_range : minimum and maximum opacity of the shade + """ + + def __init__(self, opacity_range: Optional[Tuple[float, float]] = None) -> None: + super().__init__() + self.opacity_range = opacity_range if isinstance(opacity_range, tuple) else (0.2, 0.8) + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value( + tf.math.round(255 * random_shadow(tf.cast(x, dtype=tf.float32) / 255, self.opacity_range)), + 0, + 255, + ), + dtype=tf.uint8, + ) + else: + return tf.clip_by_value(random_shadow(x, self.opacity_range), 0, 1) + + def extra_repr(self) -> str: + return f"opacity_range={self.opacity_range}"
+ + + +
+[docs] +class RandomResize(NestedObject): + """Randomly resize the input image and align corresponding targets + + >>> import numpy as np + >>> import tensorflow as tf + >>> from doctr.transforms import RandomResize + >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5) + >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.array([[0.1, 0.1, 0.4, 0.5]])) + + Args: + ---- + scale_range: range of the resizing factor for width and height (independently) + preserve_aspect_ratio: whether to preserve the aspect ratio of the image, + given a float value, the aspect ratio will be preserved with this probability + symmetric_pad: whether to symmetrically pad the image, + given a float value, the symmetric padding will be applied with this probability + p: probability to apply the transformation + """ + + def __init__( + self, + scale_range: Tuple[float, float] = (0.3, 0.9), + preserve_aspect_ratio: Union[bool, float] = False, + symmetric_pad: Union[bool, float] = False, + p: float = 0.5, + ): + super().__init__() + self.scale_range = scale_range + self.preserve_aspect_ratio = preserve_aspect_ratio + self.symmetric_pad = symmetric_pad + self.p = p + self._resize = Resize + + def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + scale_h = random.uniform(*self.scale_range) + scale_w = random.uniform(*self.scale_range) + new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w)) + + _img, _target = self._resize( + new_size, + preserve_aspect_ratio=self.preserve_aspect_ratio + if isinstance(self.preserve_aspect_ratio, bool) + else bool(np.random.rand(1) <= self.preserve_aspect_ratio), + symmetric_pad=self.symmetric_pad + if isinstance(self.symmetric_pad, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + )(img, target) + + return _img, _target + return img, target + + def extra_repr(self) -> str: + return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}" # noqa: E501
+
@@ -638,8 +955,8 @@

Source code for doctr.transforms.modules.tensorflow

- +
+ diff --git a/v0.3.1/_modules/doctr/utils/metrics.html b/v0.3.1/_modules/doctr/utils/metrics.html index 460c64a385..8a37d5949a 100644 --- a/v0.3.1/_modules/doctr/utils/metrics.html +++ b/v0.3.1/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.metrics

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-import cv2
-from typing import List, Tuple, Dict, Optional
-from unidecode import unidecode
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
-from doctr.utils.geometry import rbbox_to_polygon
+from shapely.geometry import Polygon
 
-__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
-           'nms', 'LocalizationConfusion', 'OCRMetric']
+__all__ = [
+    "TextMatch",
+    "box_iou",
+    "polygon_iou",
+    "nms",
+    "LocalizationConfusion",
+    "OCRMetric",
+    "DetectionMetric",
+]
 
 
 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
-    """Perform string comparison with multiple levels of tolerance
+    """Performs string comparison with multiple levels of tolerance
 
     Args:
+    ----
         word1: a string
         word2: another string
 
     Returns:
+    -------
         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-            unidecode counterparts and their lower-case unidecode counterparts match
+            anyascii counterparts and their lower-case anyascii counterparts match
     """
-    raw_match = (word1 == word2)
-    caseless_match = (word1.lower() == word2.lower())
-    unidecode_match = (unidecode(word1) == unidecode(word2))
+    raw_match = word1 == word2
+    caseless_match = word1.lower() == word2.lower()
+    anyascii_match = anyascii(word1) == anyascii(word2)
 
     # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
-    unicase_match = (unidecode(word1).lower() == unidecode(word2).lower())
+    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-    return raw_match, caseless_match, unidecode_match, unicase_match
+    return raw_match, caseless_match, anyascii_match, unicase_match
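A worked example (editorial illustration): comparing "Hello" against "hello" fails the raw and anyascii checks but passes the caseless and unicase ones.
>>> from doctr.utils.metrics import string_match
>>> string_match("Hello", "hello")
(False, True, False, True)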
 
 
 
-[docs] +[docs] class TextMatch: - """Implements text match metric (word-level accuracy) for recognition task. + r"""Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \\forall X, Y \\in \\mathcal{W}^N, - TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) + \forall X, Y \in \mathcal{W}^N, + TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \\forall a, x \\in \\mathcal{W}, - f_a(x) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } x = a \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{W}` is the set of all possible character sequences, + \forall a, x \in \mathcal{W}, + f_a(x) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } x = a \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. - Example:: - >>> from doctr.utils import TextMatch - >>> metric = TextMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() """ def __init__(self) -> None: self.reset() +
+[docs] def update( self, gt: List[str], @@ -354,29 +386,32 @@

Source code for doctr.utils.metrics

         """Update the state of the metric with new predictions
 
         Args:
+        ----
             gt: list of ground-truth character sequences
-            pred: list of predicted character sequences"""
-
+            pred: list of predicted character sequences
+        """
         if len(gt) != len(pred):
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.unidecode += int(_unidecode)
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
-        self.total += len(gt)
+        self.total += len(gt)
+
-[docs] +[docs] def summary(self) -> Dict[str, float]: """Computes the aggregated metrics - Returns: - a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode - counterpart and its lower-case unidecode counterpart + Returns + ------- + a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii + counterpart and its lower-case anyascii counterpart """ if self.total == 0: raise AssertionError("you need to update the metric before getting the summary") @@ -384,7 +419,7 @@

Source code for doctr.utils.metrics

         return dict(
             raw=self.raw / self.total,
             caseless=self.caseless / self.total,
-            unidecode=self.unidecode / self.total,
+            anyascii=self.anyascii / self.total,
             unicase=self.unicase / self.total,
         )
@@ -392,23 +427,25 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.raw = 0
         self.caseless = 0
-        self.unidecode = 0
+        self.anyascii = 0
         self.unicase = 0
         self.total = 0
def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Compute the IoU between two sets of bounding boxes + """Computes the IoU between two sets of bounding boxes Args: + ---- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) + Returns: + ------- the IoU matrix of shape (N, M) """ - - iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) + iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) @@ -419,107 +456,54 @@

Source code for doctr.utils.metrics

         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
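A worked example (editorial illustration): a 70x70 box fully contained in a 100x100 box gives an intersection of 4900 over a union of 10000, i.e. an IoU of 0.49.
>>> import numpy as np
>>> from doctr.utils.metrics import box_iou
>>> iou = box_iou(np.array([[0, 0, 100, 100]]), np.array([[0, 0, 70, 70]]))  # -> [[0.49]]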
 
 
-def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
-    """Compute the IoA (intersection over area) between two sets of bounding boxes:
-    ioa(i, j) = inter(i, j) / area(i)
-
-    Args:
-        boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
-        boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
-    Returns:
-        the IoA matrix of shape (N, M)
-    """
-
-    ioa_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
-
-    if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
-        l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
-        l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
-
-        left = np.maximum(l1, l2.T)
-        top = np.maximum(t1, t2.T)
-        right = np.minimum(r1, r2.T)
-        bot = np.minimum(b1, b2.T)
-
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
-        area = (r1 - l1) * (b1 - t1)
-        ioa_mat = intersection / area
-
-    return ioa_mat
-
-
-def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
-    """Compute the IoU between two sets of boolean masks
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
+    """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
-        masks_1: boolean masks of shape (N, H, W)
-        masks_2: boolean masks of shape (M, H, W)
+    ----
+        polys_1: rotated bounding boxes of shape (N, 4, 2)
+        polys_2: rotated bounding boxes of shape (M, 4, 2)
 
     Returns:
+    -------
         the IoU matrix of shape (N, M)
     """
+    if polys_1.ndim != 3 or polys_2.ndim != 3:
+        raise AssertionError("expects boxes to be in format (N, 4, 2)")
 
-    if masks_1.shape[1:] != masks_2.shape[1:]:
-        raise AssertionError("both boolean masks should have the same spatial shape")
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
 
-    iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
 
-    if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
-        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
-        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
-        axes = tuple(range(2, masks_1.ndim + 1))
-        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
 
     return iou_mat
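A worked example (editorial illustration): two unit squares overlapping on half their area share an intersection of 0.5 and a union of 1.5, i.e. an IoU of roughly 0.33.
>>> import numpy as np
>>> from doctr.utils.metrics import polygon_iou
>>> a = np.array([[[0, 0], [1, 0], [1, 1], [0, 1]]], dtype=np.float32)
>>> b = np.array([[[0.5, 0], [1.5, 0], [1.5, 1], [0.5, 1]]], dtype=np.float32)
>>> iou = polygon_iou(a, b)  # -> [[0.33333334]] approximately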
 
 
-def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Convert boxes to masks
-
-    Args:
-        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
-        shape: spatial shapes of the output masks
-
-    Returns:
-        the boolean masks of shape (N, H, W)
-    """
-
-    masks = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
-
-    if boxes.shape[0] > 0:
-        # Get absolute coordinates
-        if boxes.dtype != np.int:
-            abs_boxes = boxes.copy()
-            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
-            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
-            abs_boxes = abs_boxes.round().astype(np.int)
-        else:
-            abs_boxes = boxes
-            abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
-
-        # TODO: optimize slicing to improve vectorization
-        for idx, _box in enumerate(abs_boxes):
-            box = rbbox_to_polygon(_box)
-            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
-
-    return masks.astype(bool)
-
-
-def nms(boxes: np.ndarray, thresh: float = .5) -> List[int]:
+def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
     """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
 
     Args:
+    ----
         boxes: np array of straight boxes: (*, 5), (xmin, ymin, xmax, ymax, score)
         thresh: iou threshold to perform box suppression.
 
     Returns:
+    -------
         A list of box indexes to keep
     """
     x1 = boxes[:, 0]
@@ -551,66 +535,71 @@ 

Source code for doctr.utils.metrics

 
 
 
-[docs] +[docs] class LocalizationConfusion: - """Implements common confusion metrics and mean IoU for localization evaluation. + r"""Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ - Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) + \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ + Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ + Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ + meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \\forall y \\in \\mathcal{B}, - g_X(y) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, + \forall y \in \mathcal{B}, + g_X(y) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: + """Updates the metric + Args: + ---- + gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + """ if preds.shape[0] > 0: # Compute IoU - if self.rotated_bbox: - mask_gts = rbox_to_mask(gts, shape=self.mask_shape) - mask_preds = rbox_to_mask(preds, shape=self.mask_shape) - iou_mat = mask_iou(mask_gts, mask_preds) + if self.use_polygons: + iou_mat = polygon_iou(gts, preds) else: iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=1).sum()) + self.tot_iou += float(iou_mat.max(axis=0).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -618,17 +607,18 @@

Source code for doctr.utils.metrics

 
         # Update counts
         self.num_gts += gts.shape[0]
-        self.num_preds += preds.shape[0]
+        self.num_preds += preds.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: """Computes the aggregated metrics - Returns: + Returns + ------- a tuple with the recall, precision and meanIoU scores """ - # Recall recall = self.matches / self.num_gts if self.num_gts > 0 else None @@ -636,7 +626,7 @@

Source code for doctr.utils.metrics

         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -645,64 +635,65 @@

Source code for doctr.utils.metrics

         self.num_gts = 0
         self.num_preds = 0
         self.matches = 0
-        self.tot_iou = 0.
+ self.tot_iou = 0.0
-[docs] +[docs] class OCRMetric: - """Implements end-to-end OCR metric. + r"""Implements an end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, - \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ - Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) + \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, + \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ + Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, - h_{B,L}(b, l) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ - & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, - :math:`\\mathcal{L}` is the set of possible character sequences, + \forall (b, l) \in \mathcal{B} \times \mathcal{L}, + h_{B,L}(b, l) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - ['hello'], ['hello', 'world']) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> ['hello'], ['hello', 'world']) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs]
     def update(
         self,
         gt_boxes: np.ndarray,
@@ -710,50 +701,58 @@


         gt_labels: List[str],
         pred_labels: List[str],
     ) -> None:
+        """Updates the metric
 
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+            gt_labels: a list of N string labels
+            pred_labels: a list of M string labels
+        """
         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
-            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
-                                 "and the predictions")
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
 
         # Compute IoU
         if pred_boxes.shape[0] > 0:
-            if self.rotated_bbox:
-                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
-                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
-                iou_mat = mask_iou(mask_gts, mask_preds)
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
-            self.tot_iou += float(iou_mat.max(axis=1).sum())
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
 
             # Assign pairs
             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.unidecode_matches += int(_unidecode)
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
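                # Presumably, the four flags relax the comparison step by step (illustrative note, not library code):
                # raw      -> exact string equality
                # caseless -> equality after lower-casing
                # anyascii -> equality after ASCII transliteration (e.g. "café" vs "cafe")
                # unicase  -> transliteration and lower-casing combined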
 
         self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+
-[docs]
+[docs]
     def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]:
         """Computes the aggregated metrics
 
-        Returns:
-            a tuple with the recall & precision for each string comparison flexibility and the mean IoU
+        Returns
+        -------
+            a tuple with the recall & precision for each string comparison and the mean IoU
         """
-        # Recall
         recall = dict(
             raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None,
             caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None,
-            unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None,
+            anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None,
             unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None,
         )
@@ -761,12 +760,12 @@


         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-            unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
             unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
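A minimal sketch of how the three values returned by summary() can be consumed, reusing the illustrative inputs from the class docstring above:

    import numpy as np
    from doctr.utils import OCRMetric

    metric = OCRMetric(iou_thresh=0.5)
    metric.update(
        np.asarray([[0, 0, 100, 100]]),
        np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
        ["hello"],
        ["hello", "world"],
    )
    recall, precision, mean_iou = metric.summary()
    # recall and precision are dicts keyed by comparison mode: "raw", "caseless", "anyascii", "unicase"
    print(recall["raw"], precision["raw"], mean_iou)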
@@ -774,12 +773,136 @@


     def reset(self) -> None:
         self.num_gts = 0
         self.num_preds = 0
-        self.tot_iou = 0.
+        self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.unidecode_matches = 0
+        self.anyascii_matches = 0
         self.unicase_matches = 0
+
+
+
+[docs]
+class DetectionMetric:
+    r"""Implements an object detection metric.
+
+    The aggregated metrics are computed as follows:
+
+    .. math::
+        \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N,
+        \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\
+        Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\
+        Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\
+        meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)
+
+    with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and
+    :math:`y`, and the function :math:`h_{B, C}` defined as:
+
+    .. math::
+        \forall (b, c) \in \mathcal{B} \times \mathcal{C},
+        h_{B,C}(b, c) = \left\{
+            \begin{array}{ll}
+                1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\
+                & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\
+                0 & \mbox{otherwise.}
+            \end{array}
+        \right.
+
+    where :math:`\mathcal{B}` is the set of possible bounding boxes,
+    :math:`\mathcal{C}` is the set of possible class indices,
+    :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers.
+
+    >>> import numpy as np
+    >>> from doctr.utils import DetectionMetric
+    >>> metric = DetectionMetric(iou_thresh=0.5)
+    >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
+    >>>               np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
+    >>> metric.summary()
+
+    Args:
+    ----
+        iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
+        use_polygons: if set to True, predictions and targets will be expected to have rotated format
+    """
+
+    def __init__(
+        self,
+        iou_thresh: float = 0.5,
+        use_polygons: bool = False,
+    ) -> None:
+        self.iou_thresh = iou_thresh
+        self.use_polygons = use_polygons
+        self.reset()
+
+[docs]
+    def update(
+        self,
+        gt_boxes: np.ndarray,
+        pred_boxes: np.ndarray,
+        gt_labels: np.ndarray,
+        pred_labels: np.ndarray,
+    ) -> None:
+        """Updates the metric
+
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+            gt_labels: an array of class indices of shape (N,)
+            pred_labels: an array of class indices of shape (M,)
+        """
+        if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]:
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
+
+        # Compute IoU
+        if pred_boxes.shape[0] > 0:
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
+            else:
+                iou_mat = box_iou(gt_boxes, pred_boxes)
+
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
+
+            # Assign pairs
+            gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
+            is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
+            # Category comparison
+            self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum())
+
+        self.num_gts += gt_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+
+
+
+[docs]
+    def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]:
+        """Computes the aggregated metrics
+
+        Returns
+        -------
+            a tuple with the recall & precision for each class prediction and the mean IoU
+        """
+        # Recall
+        recall = self.num_matches / self.num_gts if self.num_gts > 0 else None
+
+        # Precision
+        precision = self.num_matches / self.num_preds if self.num_preds > 0 else None
+
+        # mean IoU (overall detected boxes)
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
+
+        return recall, precision, mean_iou
+
+
+    def reset(self) -> None:
+        self.num_gts = 0
+        self.num_preds = 0
+        self.tot_iou = 0.0
+        self.num_matches = 0
+
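The pairing step shared by OCRMetric and DetectionMetric can be reproduced in isolation. A self-contained sketch assuming only NumPy and SciPy; the IoU helper below is illustrative and is not the library's own box_iou:

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def iou_matrix(gt, preds):
        # Pairwise IoU between (N, 4) and (M, 4) boxes in (xmin, ymin, xmax, ymax) format
        lt = np.maximum(gt[:, None, :2], preds[None, :, :2])
        rb = np.minimum(gt[:, None, 2:], preds[None, :, 2:])
        inter = np.clip(rb - lt, 0, None).prod(axis=2)
        area_gt = (gt[:, 2] - gt[:, 0]) * (gt[:, 3] - gt[:, 1])
        area_pred = (preds[:, 2] - preds[:, 0]) * (preds[:, 3] - preds[:, 1])
        return inter / (area_gt[:, None] + area_pred[None, :] - inter)

    gt = np.asarray([[0, 0, 100, 100]], dtype=float)
    preds = np.asarray([[0, 0, 80, 80], [110, 95, 200, 150]], dtype=float)
    iou_mat = iou_matrix(gt, preds)  # shape (1, 2), roughly [[0.64, 0.0]]
    gt_idx, pred_idx = linear_sum_assignment(-iou_mat)  # one-to-one pairs maximising total IoU
    is_kept = iou_mat[gt_idx, pred_idx] >= 0.5
    print(list(zip(gt_idx[is_kept], pred_idx[is_kept])))  # only the overlapping pair survives the threshold

The unmatched second prediction still counts towards num_preds, which is what pulls precision and mean IoU down when a model over-detects.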
@@ -812,8 +935,8 @@


       
     
   
- - + + diff --git a/v0.3.1/_modules/doctr/utils/visualization.html b/v0.3.1/_modules/doctr/utils/visualization.html index 8e7dcca811..c818be6d7b 100644 --- a/v0.3.1/_modules/doctr/utils/visualization.html +++ b/v0.3.1/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.visualization

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import colorsys
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import matplotlib.pyplot as plt
-from matplotlib.figure import Figure
+import cv2
 import matplotlib.patches as patches
-import mplcursors
-from PIL import ImageFont, ImageDraw, Image
+import matplotlib.pyplot as plt
 import numpy as np
-import cv2
-from typing import Tuple, List, Dict, Any, Union
+from matplotlib.figure import Figure
 
-from .common_types import BoundingBox, RotatedBbox
+from .common_types import BoundingBox, Polygon4P
 
-__all__ = ['visualize_page', 'synthetize_page']
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
-def create_rect_patch(
-    geometry: Union[BoundingBox, RotatedBbox],
-    label: str,
+def rect_patch(
+    geometry: BoundingBox,
     page_dimensions: Tuple[int, int],
-    color: Tuple[int, int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
     alpha: float = 0.3,
     linewidth: int = 2,
     fill: bool = True,
-) -> patches.Patch:
-    """Create a matplotlib patch (rectangle) bounding the element
+    preserve_aspect_ratio: bool = False,
+) -> patches.Rectangle:
+    """Create a matplotlib rectangular patch for the element
 
     Args:
+    ----
         geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
         label: label to display when hovered
-        page_dimensions: dimensions of the Page
         color: color to draw box
         alpha: opacity parameter to fill the boxes, 0 = transparent
         linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
 
     Returns:
+    -------
         a rectangular Patch
     """
+    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
     height, width = page_dimensions
-    if len(geometry) == 5:
-        x, y, w, h, a = geometry  # type: ignore[misc]
-        x, w = x * width, w * width
-        y, h = y * height, h * height
-        points = cv2.boxPoints(((x, y), (w, h), a))
-        return patches.Polygon(
-            points,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
-    else:
-        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
-        xmin, xmax = xmin * width, xmax * width
-        ymin, ymax = ymin * height, ymax * height
-        return patches.Rectangle(
-            (xmin, ymin),
-            xmax - xmin,
-            ymax - ymin,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
+    (xmin, ymin), (xmax, ymax) = geometry
+    # Switch to absolute coords
+    if preserve_aspect_ratio:
+        width = height = max(height, width)
+    xmin, w = xmin * width, (xmax - xmin) * width
+    ymin, h = ymin * height, (ymax - ymin) * height
+
+    return patches.Rectangle(
+        (xmin, ymin),
+        w,
+        h,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
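A rough usage sketch for rect_patch, assuming a page already loaded as a NumPy array (rect_patch is a module-level helper rather than part of __all__, and the coordinates are made up):

    import matplotlib.pyplot as plt
    import numpy as np
    from doctr.utils.visualization import rect_patch

    page = 255 * np.ones((595, 842, 3), dtype=np.uint8)  # hypothetical blank page
    fig, ax = plt.subplots()
    ax.imshow(page)
    # the relative ((xmin, ymin), (xmax, ymax)) geometry is rescaled to absolute pixels inside rect_patch
    ax.add_patch(rect_patch(((0.1, 0.1), (0.4, 0.2)), page.shape[:2], label="word", color=(0, 0, 1)))
    plt.show()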
+def polygon_patch(
+    geometry: np.ndarray,
+    page_dimensions: Tuple[int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    alpha: float = 0.3,
+    linewidth: int = 2,
+    fill: bool = True,
+    preserve_aspect_ratio: bool = False,
+) -> patches.Polygon:
+    """Create a matplotlib polygon patch for the element
+
+    Args:
+    ----
+        geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
+        label: label to display when hovered
+        color: color to draw box
+        alpha: opacity parameter to fill the boxes, 0 = transparent
+        linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
+
+    Returns:
+    -------
+        a polygon Patch
+    """
+    if not geometry.shape == (4, 2):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
+    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
+    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
+
+    return patches.Polygon(
+        geometry,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def create_obj_patch(
+    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
+    page_dimensions: Tuple[int, int],
+    **kwargs: Any,
+) -> patches.Patch:
+    """Create a matplotlib patch for the element
+
+    Args:
+    ----
+        geometry: bounding box (straight or rotated) of the element
+        page_dimensions: dimensions of the page in format (height, width)
+        **kwargs: keyword arguments for the patch
+
+    Returns:
+    -------
+        a matplotlib Patch
+    """
+    if isinstance(geometry, tuple):
+        if len(geometry) == 2:  # straight word BB (2 pts)
+            return rect_patch(geometry, page_dimensions, **kwargs)
+        elif len(geometry) == 4:  # rotated word BB (4 pts)
+            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
+    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
+        return polygon_patch(geometry, page_dimensions, **kwargs)
+    raise ValueError("invalid geometry format")
+
+
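A short sketch of the dispatch above with made-up relative coordinates (create_obj_patch is importable from the module even though it is not listed in __all__):

    import numpy as np
    from doctr.utils.visualization import create_obj_patch

    dims = (1024, 768)  # (height, width) of the page
    straight = create_obj_patch(((0.1, 0.1), (0.3, 0.2)), dims)  # 2-point box -> Rectangle patch
    rotated = create_obj_patch(np.array([[0.1, 0.1], [0.3, 0.1], [0.3, 0.2], [0.1, 0.2]]), dims)  # (4, 2) -> Polygon patch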
+def get_colors(num_colors: int) -> List[Tuple[float, float, float]]:
+    """Generate num_colors color for matplotlib
+
+    Args:
+    ----
+        num_colors: number of colors to generate
+
+    Returns:
+    -------
+        colors: list of generated colors
+    """
+    colors = []
+    for i in np.arange(0.0, 360.0, 360.0 / num_colors):
+        hue = i / 360.0
+        lightness = (50 + np.random.rand() * 10) / 100.0
+        saturation = (90 + np.random.rand() * 10) / 100.0
+        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+    return colors
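This helper is how visualize_kie_page (further down) assigns one color per prediction class; a tiny sketch with hypothetical class names:

    classes = ["names", "dates", "totals"]  # hypothetical KIE classes
    colors = {k: c for c, k in zip(get_colors(len(classes)), classes)}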
 
 
 
-[docs]
+[docs]
 def visualize_page(
     page: Dict[str, Any],
     image: np.ndarray,
@@ -359,18 +472,18 @@


 ) -> Figure:
     """Visualize a full page with predicted blocks, lines and words
 
-    Example::
-        >>> import numpy as np
-        >>> import matplotlib.pyplot as plt
-        >>> from doctr.utils.visualization import visualize_page
-        >>> from doctr.models import ocr_db_crnn
-        >>> model = ocr_db_crnn(pretrained=True)
-        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-        >>> out = model([[input_page]])
-        >>> visualize_page(out[0].pages[0].export(), input_page)
-        >>> plt.show()
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from doctr.utils.visualization import visualize_page
+    >>> from doctr.models import ocr_db_crnn
+    >>> model = ocr_db_crnn(pretrained=True)
+    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+    >>> out = model([[input_page]])
+    >>> visualize_page(out[0].pages[0].export(), input_page)
+    >>> plt.show()
 
     Args:
+    ----
         page: the exported Page of a Document
         image: np array of the page, needs to have the same shape than page['dimensions']
         words_only: whether only words should be displayed
@@ -378,6 +491,11 @@ 


         scale: figsize of the largest windows side
         interactive: whether the plot should be interactive
         add_labels: for static plot, adds text labels on top of bounding box
+        **kwargs: keyword arguments for the polygon patch
+
+    Returns:
+    -------
+        the matplotlib figure
     """
     # Get proper scale and aspect ratio
     h, w = image.shape[:2]
@@ -386,128 +504,189 @@ 


     # Display the image
     ax.imshow(image)
     # hide both axis
-    ax.axis('off')
+    ax.axis("off")
 
     if interactive:
         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
 
-    for block in page['blocks']:
+    for block in page["blocks"]:
         if not words_only:
-            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
+            rect = create_obj_patch(
+                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
+            )
             # add patch on figure
             ax.add_patch(rect)
             if interactive:
                 # add patch to cursor's artists
                 artists.append(rect)
 
-        for line in block['lines']:
+        for line in block["lines"]:
             if not words_only:
-                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
+                rect = create_obj_patch(
+                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
-            for word in line['words']:
-                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
-                                         page['dimensions'], (0, 0, 1), **kwargs)
+            for word in line["words"]:
+                rect = create_obj_patch(
+                    word["geometry"],
+                    page["dimensions"],
+                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
+                    color=(0, 0, 1),
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
                 elif add_labels:
-                    if len(word['geometry']) == 5:
+                    if len(word["geometry"]) == 5:
                         text_loc = (
-                            int(page['dimensions'][1] * (word['geometry'][0] - word['geometry'][2] / 2)),
-                            int(page['dimensions'][0] * (word['geometry'][1] - word['geometry'][3] / 2))
+                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
+                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
                         )
                     else:
                         text_loc = (
-                            int(page['dimensions'][1] * word['geometry'][0][0]),
-                            int(page['dimensions'][0] * word['geometry'][0][1])
+                            int(page["dimensions"][1] * word["geometry"][0][0]),
+                            int(page["dimensions"][0] * word["geometry"][0][1]),
+                        )
+
+                    if len(word["geometry"]) == 2:
+                        # We draw only if boxes are in straight format
+                        ax.text(
+                            *text_loc,
+                            word["value"],
+                            size=10,
+                            alpha=0.5,
+                            color=(0, 0, 1),
                         )
-                    ax.text(
-                        *text_loc,
-                        word['value'],
-                        size=10,
-                        alpha=0.5,
-                        color=(0, 0, 1),
-                    )
 
         if display_artefacts:
-            for artefact in block['artefacts']:
-                rect = create_rect_patch(
-                    artefact['geometry'],
-                    'artefact',
-                    page['dimensions'],
-                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
+            for artefact in block["artefacts"]:
+                rect = create_obj_patch(
+                    artefact["geometry"],
+                    page["dimensions"],
+                    label="artefact",
+                    color=(0.5, 0.5, 0.5),
                     linewidth=1,
-                    **kwargs
+                    **kwargs,
                 )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
-    fig.tight_layout(pad=0.)
+    fig.tight_layout(pad=0.0)
 
     return fig
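For headless use, the interactive mplcursors layer can be skipped; a sketch reusing the docstring setup above (random page, purely illustrative):

    import numpy as np
    from doctr.models import ocr_db_crnn
    from doctr.utils.visualization import visualize_page

    model = ocr_db_crnn(pretrained=True)
    input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
    out = model([[input_page]])
    # interactive=False avoids the optional mplcursors import; add_labels overlays word values on straight boxes
    fig = visualize_page(out[0].pages[0].export(), input_page, interactive=False, add_labels=True)
    fig.savefig("page_viz.png", bbox_inches="tight")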
-def synthetize_page( +def visualize_kie_page( page: Dict[str, Any], - draw_proba: bool = False, - font_size: int = 13, -) -> np.ndarray: - """Draw a the content of the element page (OCR response) on a blank page. + image: np.ndarray, + words_only: bool = False, + display_artefacts: bool = True, + scale: float = 10, + interactive: bool = True, + add_labels: bool = True, + **kwargs: Any, +) -> Figure: + """Visualize a full page with predicted blocks, lines and words + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from doctr.utils.visualization import visualize_page + >>> from doctr.models import ocr_db_crnn + >>> model = ocr_db_crnn(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([[input_page]]) + >>> visualize_kie_page(out[0].pages[0].export(), input_page) + >>> plt.show() Args: - page: exported Page object to represent - draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0 - font_size: size of the font, default font = 13 + ---- + page: the exported Page of a Document + image: np array of the page, needs to have the same shape than page['dimensions'] + words_only: whether only words should be displayed + display_artefacts: whether artefacts should be displayed + scale: figsize of the largest windows side + interactive: whether the plot should be interactive + add_labels: for static plot, adds text labels on top of bounding box + **kwargs: keyword arguments for the polygon patch - Return: - A np array (drawn page) + Returns: + ------- + the matplotlib figure """ - # Draw template - h, w = page["dimensions"] - response = 255 * np.ones((h, w, 3), dtype=np.int32) + # Get proper scale and aspect ratio + h, w = image.shape[:2] + size = (scale * w / h, scale) if h > w else (scale, h / w * scale) + fig, ax = plt.subplots(figsize=size) + # Display the image + ax.imshow(image) + # hide both axis + ax.axis("off") - # Draw each word - for block in page["blocks"]: - for line in block["lines"]: - for word in line["words"]: - # Get aboslute word geometry - (xmin, ymin), (xmax, ymax) = word["geometry"] - xmin, xmax = int(w * xmin), int(w * xmax) - ymin, ymax = int(h * ymin), int(h * ymax) - - # White drawing context adapted to font size, 0.75 factor to convert pts --> pix - h_box, w_box = ymax - ymin, xmax - xmin - h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75)) - img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255)) - d = ImageDraw.Draw(img) - - # Draw in black the value of the word - d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0)) - - # Resize back to box size - img = img.resize((w_box, h_box), Image.NEAREST) - - # Colorize if draw_proba - if draw_proba: - p = int(255 * word["confidence"]) - mask = np.where(np.array(img) == 0, 1, 0) - proba = np.array([255 - p, 0, p]) - color = mask * proba[np.newaxis, np.newaxis, :] - white_mask = 255 * (1 - mask) - img = color + white_mask - - # Write to response page - response[ymin:ymax, xmin:xmax, :] = np.array(img) - - return response + if interactive: + artists: List[patches.Patch] = [] # instantiate an empty list of patches (to be drawn on the page) + + colors = {k: color for color, k in zip(get_colors(len(page["predictions"])), page["predictions"])} + for key, value in page["predictions"].items(): + for prediction in value: + if not words_only: + rect = create_obj_patch( + prediction["geometry"], + page["dimensions"], + label=f"{key} \n {prediction['value']} (confidence: 
{prediction['confidence']:.2%}", + color=colors[key], + linewidth=1, + **kwargs, + ) + # add patch on figure + ax.add_patch(rect) + if interactive: + # add patch to cursor's artists + artists.append(rect) + + if interactive: + import mplcursors + + # Create mlp Cursor to hover patches in artists + mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label())) + fig.tight_layout(pad=0.0) + + return fig + + +def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None: + """Draw an array of relative straight boxes on an image + + Args: + ---- + boxes: array of relative boxes, of shape (*, 4) + image: np array, float32 or uint8 + color: color to use for bounding box edges + **kwargs: keyword arguments from `matplotlib.pyplot.plot` + """ + h, w = image.shape[:2] + # Convert boxes to absolute coords + _boxes = deepcopy(boxes) + _boxes[:, [0, 2]] *= w + _boxes[:, [1, 3]] *= h + _boxes = _boxes.astype(np.int32) + for box in _boxes.tolist(): + xmin, ymin, xmax, ymax = box + image = cv2.rectangle( + image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2 + ) + plt.imshow(image) + plt.plot(**kwargs)
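A short sketch for draw_boxes, with made-up relative boxes drawn on a blank image:

    import matplotlib.pyplot as plt
    import numpy as np
    from doctr.utils.visualization import draw_boxes

    image = np.zeros((200, 300, 3), dtype=np.uint8)
    boxes = np.array([[0.1, 0.2, 0.4, 0.5], [0.6, 0.1, 0.9, 0.3]])  # relative (xmin, ymin, xmax, ymax)
    draw_boxes(boxes, image, color=(0, 255, 0))
    plt.show()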
@@ -540,8 +719,8 @@


       
     
   
- - + + diff --git a/v0.3.1/_modules/index.html b/v0.3.1/_modules/index.html index e86abcd4d4..5793c44f20 100644 --- a/v0.3.1/_modules/index.html +++ b/v0.3.1/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -225,20 +225,42 @@ - - + + diff --git a/v0.3.1/_sources/changelog.rst.txt b/v0.3.1/_sources/changelog.rst.txt index 430097d6c8..35befe7b96 100644 --- a/v0.3.1/_sources/changelog.rst.txt +++ b/v0.3.1/_sources/changelog.rst.txt @@ -1,6 +1,54 @@ Changelog ========= +v0.10.0 (2024-10-21) +------------------- +Release note: `v0.10.0 `_ + +v0.9.0 (2024-08-08) +------------------- +Release note: `v0.9.0 `_ + +v0.8.1 (2024-03-04) +------------------- +Release note: `v0.8.1 `_ + +v0.8.0 (2024-02-28) +------------------- +Release note: `v0.8.0 `_ + +v0.7.0 (2023-09-09) +------------------- +Release note: `v0.7.0 `_ + +v0.6.0 (2022-09-29) +------------------- +Release note: `v0.6.0 `_ + +v0.5.1 (2022-03-22) +------------------- +Release note: `v0.5.1 `_ + +v0.5.0 (2021-12-31) +------------------- +Release note: `v0.5.0 `_ + +v0.4.1 (2021-11-22) +------------------- +Release note: `v0.4.1 `_ + +v0.4.0 (2021-10-01) +------------------- +Release note: `v0.4.0 `_ + +v0.3.1 (2021-08-27) +------------------- +Release note: `v0.3.1 `_ + +v0.3.0 (2021-07-02) +------------------- +Release note: `v0.3.0 `_ + v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.3.1/_sources/datasets.rst.txt b/v0.3.1/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.3.1/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.3.1/_sources/documents.rst.txt b/v0.3.1/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.3.1/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. 
currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.3.1/_sources/getting_started/installing.rst.txt b/v0.3.1/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.3.1/_sources/getting_started/installing.rst.txt +++ b/v0.3.1/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.3.1/_sources/index.rst.txt b/v0.3.1/_sources/index.rst.txt index fc3ff89fdf..53251db142 100644 --- a/v0.3.1/_sources/index.rst.txt +++ b/v0.3.1/_sources/index.rst.txt @@ -1,7 +1,8 @@ -DocTR: Document Text Recognition -================================ +******************************** +docTR: Document Text Recognition +******************************** -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -9,38 +10,29 @@ State-of-the-art Optical Character Recognition made seamless & accessible to any DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. +* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. -Welcome to the documentation of `DocTR `_! 
- - Main Features ------------- * |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract +* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, small dependencies -* |:tools:| Daily maintained -* |:factory:| Easy integration - +* |:bird:| Light package, minimal dependencies +* |:tools:| Actively maintained by Mindee +* |:factory:| Easy integration (available templates for browser demo & API deployment) -Getting Started ---------------- .. toctree:: :maxdepth: 2 + :caption: Getting started + :hidden: - installing - - -Build & train your predictor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) -* Fine-tune or train from scratch any detection or recognition model to specialize on your data + getting_started/installing + notebooks Model zoo @@ -48,36 +40,83 @@ Model zoo Text detection models """"""""""""""""""""" - * `DBNet `_ (Differentiable Binarization) - * `LinkNet `_ +* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ +* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ +* FAST from `"FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation" `_ Text recognition models """"""""""""""""""""""" - * `SAR `_ (Show, Attend and Read) - * `CRNN `_ (Convolutional Recurrent Neural Network) - * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) +* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ +* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ +* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ +* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ +* PARSeq from `"Scene Text Recognition with Permuted Autoregressive Sequence Models" `_ Supported datasets ^^^^^^^^^^^^^^^^^^ - * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. - * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. - * SROIE from `ICDAR 2019 `_. +* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. +* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +* SROIE from `ICDAR 2019 `_. +* IIIT-5k from `CVIT `_. +* Street View Text from `"End-to-End Scene Text Recognition" `_. +* SynthText from `Visual Geometry Group `_. +* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. +* IC03 from `ICDAR 2003 `_. +* IC13 from `ICDAR 2013 `_. +* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. +* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. +* IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. 
toctree:: :maxdepth: 2 - :caption: Notes + :caption: Using docTR + :hidden: - changelog + using_doctr/using_models + using_doctr/using_datasets + using_doctr/using_contrib_modules + using_doctr/sharing_models + using_doctr/using_model_export + using_doctr/custom_models_training + using_doctr/running_on_aws + + +.. toctree:: + :maxdepth: 2 + :caption: Community + :hidden: + + community/resources .. toctree:: :maxdepth: 2 :caption: Package Reference + :hidden: - datasets - documents - models - transforms - utils + modules/contrib + modules/datasets + modules/io + modules/models + modules/transforms + modules/utils + + +.. toctree:: + :maxdepth: 2 + :caption: Contributing + :hidden: + + contributing/code_of_conduct + contributing/contributing + + +.. toctree:: + :maxdepth: 2 + :caption: Notes + :hidden: + + changelog diff --git a/v0.3.1/_sources/installing.rst.txt b/v0.3.1/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.3.1/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.3.1/_sources/models.rst.txt b/v0.3.1/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.3.1/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. 
- - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. 
-We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.3.1/_sources/transforms.rst.txt b/v0.3.1/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.3.1/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. 
autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.3.1/_sources/utils.rst.txt b/v0.3.1/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.3.1/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.3.1/_static/basic.css b/v0.3.1/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.3.1/_static/basic.css +++ b/v0.3.1/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.3.1/_static/doctools.js b/v0.3.1/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.3.1/_static/doctools.js +++ b/v0.3.1/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.3.1/_static/documentation_options.js b/v0.3.1/_static/documentation_options.js index a7b5cbe04a..4f656fdbea 100644 --- a/v0.3.1/_static/documentation_options.js +++ b/v0.3.1/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.3.0a0-git', + VERSION: '0.10.1a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.3.1/_static/language_data.js b/v0.3.1/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.3.1/_static/language_data.js +++ b/v0.3.1/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.3.1/_static/searchtools.js b/v0.3.1/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.3.1/_static/searchtools.js +++ b/v0.3.1/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.3.1/changelog.html b/v0.3.1/changelog.html index eafac3a877..fc45a50384 100644 --- a/v0.3.1/changelog.html +++ b/v0.3.1/changelog.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + Changelog - docTR documentation @@ -226,20 +226,42 @@ + diff --git a/v0.3.1/community/resources.html b/v0.3.1/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.3.1/community/resources.html +++ b/v0.3.1/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.3.1/contributing/code_of_conduct.html b/v0.3.1/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.3.1/contributing/code_of_conduct.html +++ b/v0.3.1/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.3.1/contributing/contributing.html b/v0.3.1/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.3.1/contributing/contributing.html +++ b/v0.3.1/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.3.1/datasets.html b/v0.3.1/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.3.1/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
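OCRDataset has no usage example above; here is a minimal sketch (the image folder and label file paths below are placeholders, not files shipped with the package):

>>> from doctr.datasets import OCRDataset
>>> # "path/to/images" and "path/to/labels.json" are hypothetical paths to your own data
>>> train_set = OCRDataset(img_folder="path/to/images", label_file="path/to/labels.json")
>>> img, target = train_set[0]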
- -
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before passing it to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets of vocabs.

-
DocTR Vocabs

Name          | Size | Characters
------------- | ---- | ----------
digits        | 10   | 0123456789
ascii_letters | 52   | abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
punctuation   | 32   | !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
currency      | 5    | £€¥¢฿
latin         | 96   | 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°
french        | 154  | 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿
-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
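As a quick illustration of this function (a minimal sketch; the vocab string and the eos/pad indices below are arbitrary choices for the example, not values imposed by the library):

>>> from doctr.datasets import encode_sequences
>>> # encode two words with the "digits" vocab, padding them to a fixed length
>>> encoded = encode_sequences(["123", "42"], vocab="0123456789", target_size=8, eos=10, pad=11)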
-
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/documents.html b/v0.3.1/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.3.1/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, two groups of words lying at the same height but in different columns are considered as two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
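To make the structure above more concrete, here is a minimal sketch that builds a one-word document by hand, following the signatures documented above (all values, confidences and coordinates are made up for the example):

>>> from doctr.documents import Word, Line, Block, Page, Document
>>> word = Word(value="hello", confidence=0.99, geometry=((0.1, 0.1), (0.3, 0.15)))
>>> line = Line(words=[word])
>>> block = Block(lines=[line])
>>> page = Page(blocks=[block], page_idx=0, dimensions=(896, 672))
>>> doc = Document(pages=[page])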
-
-
-

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF stream (which can then be handled like any other PDF document).

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/genindex.html b/v0.3.1/genindex.html index a19b433943..21520455b4 100644 --- a/v0.3.1/genindex.html +++ b/v0.3.1/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -224,20 +224,42 @@

+
+

U

+ + +
+
+

V

@@ -561,7 +711,13 @@

V

W

+
@@ -599,8 +755,8 @@

W

- - + + diff --git a/v0.3.1/getting_started/installing.html b/v0.3.1/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.3.1/getting_started/installing.html +++ b/v0.3.1/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.3.1/index.html b/v0.3.1/index.html index 4c6a28c66a..3a06afc6d9 100644 --- a/v0.3.1/index.html +++ b/v0.3.1/index.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + docTR documentation @@ -226,20 +226,42 @@
-

DocTR: Document Text Recognition

-

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta)

+

docTR: Document Text Recognition

+

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

DocTR provides an easy and powerful way to extract valuable information from your documents:

    -
  • 🧾 for automation: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • +
  • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • 👩‍🔬 for research: quickly compare your own architectures speed & performances with state-of-art models on public datasets.

-

Welcome to the documentation of DocTR!

Main Features

  • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

  • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor

  • -
  • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

  • +
  • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

  • ⚡ Optimized for inference speed on both CPU & GPU

  • -
  • 🐦 Light package, small dependencies

  • -
  • 🛠️ Daily maintained

  • -
  • 🏭 Easy integration

  • +
  • 🐦 Light package, minimal dependencies

  • +
  • 🛠️ Actively maintained by Mindee

  • +
  • 🏭 Easy integration (available templates for browser demo & API deployment)

-
-
-

Getting Started

-
-

Build & train your predictor

-
    -
  • Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

  • -
  • Fine-tune or train from scratch any detection or recognition model to specialize on your data

  • -
-

Model zoo

Text detection models

-
-

Text recognition models

-
-

Supported datasets

-
-
+
+
+
+
+
@@ -406,7 +381,7 @@

Supported datasets - +
Next @@ -446,10 +421,8 @@

Supported datasets + diff --git a/v0.3.1/installing.html b/v0.3.1/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.3.1/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running another OS than Linux, you will need a few extra dependencies.

-

For MacOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the last stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/models.html b/v0.3.1/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.3.1/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. Whether they are performed at once or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
Architecture | Input shape     | # params | FUNSD Recall | FUNSD Precision | CORD Recall | CORD Precision | FPS
db_resnet50  | (1024, 1024, 3) | 25.2 M   | 82.14        | 87.64           | 92.49       | 89.66          | 2.1
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities.

-

FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

-
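As a rough illustration of this protocol, here is a minimal sketch for the detection case (not the exact benchmarking script; the warm-up and batch counts are taken from the description above):

>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> for _ in range(100):  # warm-up on random tensors
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32), training=False)
>>> start = time.perf_counter()
>>> for _ in range(1000):  # timed run, batches of 1 frame
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32), training=False)
>>> fps = 1000 / (time.perf_counter() - start)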
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
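A minimal sketch of these three steps (assuming TensorFlow; the target size matches the input shape reported above, while the normalization statistics are illustrative, not the exact values used by the pretrained models):

>>> import tensorflow as tf
>>> images = [tf.random.uniform(shape=[595, 842, 3], maxval=1, dtype=tf.float32) for _ in range(4)]
>>> # 1. resize each input image to the target size (possibly deforming it)
>>> resized = [tf.image.resize(img, (1024, 1024), method="bilinear") for img in images]
>>> # 2. batch images together
>>> batch = tf.stack(resized, axis=0)
>>> # 3. normalize the batch using (illustrative) training data statistics
>>> mean, std = tf.constant([0.5, 0.5, 0.5]), tf.constant([0.3, 0.3, 0.3])
>>> batch = (batch - mean) / std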
-

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
Text recognition model zoo

Architecture  | Input shape  | # params | FUNSD | CORD | FPS
crnn_vgg16_bn | (32, 128, 3) | 15.8M    | 86.02 | 91.3 | 12.8
sar_vgg16_bn  | (32, 128, 3) | 21.5M    | 86.2  | 91.7 | 3.3
sar_resnet31  | (32, 128, 3) | 53.1M    | 86.3  | 92.1 | 2.7
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

All these recognition models are trained with our french vocab (cf. Supported Vocabs).

Disclaimer: both FUNSD subsets combined have 30,595 word-level crops, which might not be representative enough of the model capabilities.

-

FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
-
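A minimal sketch of the resize-and-pad part of this scheme (assuming TensorFlow; the (32, 128) target shape matches the tables above, the input shape is arbitrary):

>>> import tensorflow as tf
>>> img = tf.random.uniform(shape=[48, 256, 3], maxval=1, dtype=tf.float32)
>>> # resize to fit within (32, 128) while preserving the aspect ratio, then pad with zeros
>>> resized = tf.image.resize(img, (32, 128), preserve_aspect_ratio=True)
>>> padded = tf.image.pad_to_bounding_box(resized, 0, 0, 32, 128)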
-

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
-

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import crnn_vgg16_bn
->>> model = crnn_vgg16_bn(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a VGG16 feature extractor as described in “Show, Attend and Read:A Simple and Strong -Baseline for Irregular Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import sar_vgg16_bn
->>> model = sar_vgg16_bn(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a resnet-31 feature extractor as described in “Show, Attend and Read:A Simple and Strong -Baseline for Irregular Text Recognition”.

-

Example

-
>>> import tensorflow as tf
->>> from doctr.models import sar_resnet31
->>> model = sar_resnet31(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
-

MASTER as described in “MASTER: Multi-Aspect Non-local Network for Scene Text Recognition” (https://arxiv.org/pdf/1910.02562.pdf).

Example:
-
>>> import tensorflow as tf
->>> from doctr.models import master
->>> model = master(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-

Recognition predictors

-

Combining the right components around a given architecture for easier usage.

-
-
-doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
-

Text recognition architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import recognition_predictor
->>> model = recognition_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • -
  • pretrained – If True, returns a model pre-trained on our text recognition dataset

  • -
-
-
Returns:
-

Recognition predictor

-
-
-
- -
-
-
-

End-to-End OCR

-

Predictors that localize and identify text elements in images

-
Architecture                | FUNSD Recall | FUNSD Precision | FUNSD FPS | CORD Recall | CORD Precision | CORD FPS
db_resnet50 + crnn_vgg16_bn | 70.08        | 74.77           | 0.85      | 82.19       | 79.67          | 1.6
db_resnet50 + sar_vgg16_bn  | N/A          | N/A             | 0.49      | N/A         | N/A            | 1.0
db_resnet50 + sar_resnet31  | N/A          | N/A             | 0.27      | N/A         | N/A            | 0.83
Gvision text detection      | 59.50        | 62.50           |           | 75.30       | 70.00          |
Gvision doc. text detection | 64.00        | 53.30           |           | 68.90       | 61.10          |
AWS textract                | 78.10        | 83.00           |           | 87.50       | 66.00          |
-

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

All recognition models of predictors are trained with our french vocab (cf. Supported Vocabs).

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities.

-

FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

-

Results on private ocr datasets

-
Architecture                       | Receipts Recall | Receipts Precision | Invoices Recall | Invoices Precision | IDs Recall | IDs Precision
db_resnet50 + crnn_vgg16_bn (ours) | 78.90           | 81.01              | 65.68           | 69.86              | 49.48      | 50.46
Gvision doc. text detection        | 68.91           | 59.89              | 63.20           | 52.85              | 43.70      | 29.21
AWS textract                       | 75.77           | 77.70              | 70.47           | 69.13              | 46.39      | 43.32
-
-
-

Two-stage approaches

-

Those architectures involve one stage of text detection and one stage of text recognition. The text detection stage is used to produce cropped images that are then passed to the text recognition block.

-
-
-doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]
-

End-to-end OCR architecture using one model for localization, and another for text recognition.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import ocr_predictor
->>> model = ocr_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

  • -
  • pretrained – If True, returns a model pre-trained on our OCR dataset

  • -
-
-
Returns:
-

OCR predictor

-
-
-
- -
-
-
-

Model export

-

Utility functions to make the most of document analysis models.

-
-

Model compression

-
-
-doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
-

Converts a model to TFLite format

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_tflite, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_tflite(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
-

Converts a model to half precision

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_fp16, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_fp16(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the serialized FP16 model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
-

Quantize a Tensorflow model

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import quantize_model, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = quantize_model(model, (224, 224, 3))
-
-
-
-
-
-
Parameters:
-
    -
  • tf_model – a keras model

  • -
  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

  • -
-
-
Returns:
-

the serialized quantized model

-
-
Return type:
-

bytes

-
-
-
- -
-
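These export helpers all return raw bytes; here is a minimal sketch of persisting one of the serialized models to disk (the output file name is just an example):

>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_tflite, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> with open('db_export.tflite', 'wb') as f:
...     f.write(convert_to_tflite(model))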
-

Using SavedModel

-

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

-
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> _ = model(input_t, training=False)
->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
-
-
-

And loaded just as easily:

-
>>> import tensorflow as tf
->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
-
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/modules/contrib.html b/v0.3.1/modules/contrib.html index e99f6b3f74..7fb86b8b38 100644 --- a/v0.3.1/modules/contrib.html +++ b/v0.3.1/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -380,7 +380,7 @@

Supported contribution modules - + diff --git a/v0.3.1/modules/datasets.html b/v0.3.1/modules/datasets.html index 456e10b172..380a986793 100644 --- a/v0.3.1/modules/datasets.html +++ b/v0.3.1/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -1081,7 +1081,7 @@

Returns:

- + diff --git a/v0.3.1/modules/io.html b/v0.3.1/modules/io.html index 01eadaa4b8..24c41954be 100644 --- a/v0.3.1/modules/io.html +++ b/v0.3.1/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -760,7 +760,7 @@

Returns: - + diff --git a/v0.3.1/modules/models.html b/v0.3.1/modules/models.html index c465cc0586..91b8810a6a 100644 --- a/v0.3.1/modules/models.html +++ b/v0.3.1/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1612,7 +1612,7 @@

Args: - + diff --git a/v0.3.1/modules/transforms.html b/v0.3.1/modules/transforms.html index 30f7a2631a..c5ead3f3ce 100644 --- a/v0.3.1/modules/transforms.html +++ b/v0.3.1/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -835,7 +835,7 @@

Args:< - + diff --git a/v0.3.1/modules/utils.html b/v0.3.1/modules/utils.html index 888a32c321..b7f6fc570b 100644 --- a/v0.3.1/modules/utils.html +++ b/v0.3.1/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -715,7 +715,7 @@

Args: - + diff --git a/v0.3.1/notebooks.html b/v0.3.1/notebooks.html index f97771aebb..d36539f59e 100644 --- a/v0.3.1/notebooks.html +++ b/v0.3.1/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -391,7 +391,7 @@

docTR Notebooks - + diff --git a/v0.3.1/objects.inv b/v0.3.1/objects.inv index a22d2ce821..c1700f291b 100644 Binary files a/v0.3.1/objects.inv and b/v0.3.1/objects.inv differ diff --git a/v0.3.1/py-modindex.html b/v0.3.1/py-modindex.html deleted file mode 100644 index c1569be607..0000000000 --- a/v0.3.1/py-modindex.html +++ /dev/null @@ -1,330 +0,0 @@ - - - - - - - - - - - Python Module Index - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Python Module Index

- -
- - - - - - - - - - - -
 
d
- doctr -
- - - - - - - - - \ No newline at end of file diff --git a/v0.3.1/search.html b/v0.3.1/search.html index 73772822d2..d050f5eac7 100644 --- a/v0.3.1/search.html +++ b/v0.3.1/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -226,20 +226,42 @@ - - + + diff --git a/v0.3.1/searchindex.js b/v0.3.1/searchindex.js index 803f4f4bcf..6f154115ab 100644 --- a/v0.3.1/searchindex.js +++ b/v0.3.1/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Artefact": [[2, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Block": [[2, "block"]], "Build & train your predictor": [[3, "build-train-your-predictor"]], "Changelog": [[0, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection models": [[5, "detection-models"]], "Detection predictors": [[5, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[3, null]], "Document": [[2, "document"]], "Document structure": [[2, "document-structure"]], "End-to-End OCR": [[5, "end-to-end-ocr"]], "File reading": [[2, "file-reading"]], "Getting Started": [[3, "getting-started"]], "Installation": [[4, null]], "Line": [[2, "line"]], "Main Features": [[3, "main-features"]], "Model compression": [[5, "model-compression"]], "Model export": [[5, "model-export"]], "Model zoo": [[3, "model-zoo"]], "Notes": [[3, null]], "Package Reference": [[3, null]], "Page": [[2, "page"]], "Pre-processing for detection": [[5, "pre-processing-for-detection"]], "Pre-processing for recognition": [[5, "pre-processing-for-recognition"]], "Prerequisites": [[4, "prerequisites"]], "Recognition models": [[5, "recognition-models"]], "Recognition predictors": [[5, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[3, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[7, "task-evaluation"]], "Text Detection": [[5, "text-detection"]], "Text Recognition": [[5, "text-recognition"]], "Text detection models": [[3, "text-detection-models"]], "Text recognition model zoo": [[5, "id2"]], "Text recognition models": [[3, "text-recognition-models"]], "Two-stage approaches": [[5, "two-stage-approaches"]], "Using SavedModel": [[5, "using-savedmodel"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[7, "visualization"]], "Word": [[2, "word"]], "doctr.datasets": [[1, null]], "doctr.documents": [[2, null]], "doctr.models": [[5, null]], "doctr.transforms": [[6, null]], "doctr.utils": [[7, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]]}, "docnames": ["changelog", "datasets", "documents", "index", "installing", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "documents.rst", "index.rst", "installing.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.documents)": [[2, "doctr.documents.Artefact", false]], "as_images() (doctr.documents.pdf 
method)": [[2, "doctr.documents.PDF.as_images", false]], "block (class in doctr.documents)": [[2, "doctr.documents.Block", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.documents)": [[2, "doctr.documents.Document", false]], "documentfile (class in doctr.documents)": [[2, "doctr.documents.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_images", false]], "from_pdf() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_pdf", false]], "from_url() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_artefacts", false]], "get_words() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.documents)": [[2, "doctr.documents.Line", false]], "linknet16() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet16", false]], "localizationconfusion (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.LocalizationConfusion", false]], "master() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.master", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[5, "doctr.models.zoo.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.documents)": [[2, "doctr.documents.Page", false]], "pdf (class in doctr.documents)": [[2, "doctr.documents.PDF", false]], "quantize_model() (in module doctr.models.export)": [[5, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[6, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality 
(class in doctr.transforms)": [[6, "doctr.transforms.RandomJpegQuality", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.documents)": [[2, "doctr.documents.read_html", false]], "read_img() (in module doctr.documents)": [[2, "doctr.documents.read_img", false]], "read_pdf() (in module doctr.documents)": [[2, "doctr.documents.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_vgg16_bn", false]], "show() (doctr.documents.document method)": [[2, "doctr.documents.Document.show", false]], "show() (doctr.documents.page method)": [[2, "doctr.documents.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[7, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[7, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[7, "doctr.utils.metrics.TextMatch.summary", false]], "textmatch (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.datasets)": [[1, "doctr.datasets.datasets.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[7, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.documents)": [[2, "doctr.documents.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, "", "CORD"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.datasets": [[1, 0, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.documents": [[2, 0, 1, "", "Artefact"], [2, 0, 1, "", "Block"], [2, 0, 1, "", "Document"], [2, 0, 1, "", "DocumentFile"], [2, 0, 1, "", "Line"], [2, 0, 1, "", "PDF"], [2, 0, 1, "", "Page"], [2, 0, 1, "", "Word"], [2, 1, 1, "", "read_html"], [2, 1, 1, "", "read_img"], [2, 1, 1, "", "read_pdf"]], "doctr.documents.Document": [[2, 2, 1, "", "show"]], "doctr.documents.DocumentFile": [[2, 2, 1, "", "from_images"], [2, 2, 1, "", "from_pdf"], [2, 2, 1, "", "from_url"]], "doctr.documents.PDF": [[2, 2, 1, "", "as_images"], [2, 2, 1, "", "get_artefacts"], [2, 2, 1, "", "get_words"]], "doctr.documents.Page": [[2, 2, 1, "", "show"]], "doctr.models.detection": [[5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet16"]], "doctr.models.export": [[5, 1, 1, "", "convert_to_fp16"], [5, 1, 1, "", "convert_to_tflite"], [5, 1, 1, "", "quantize_model"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "master"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"], [5, 1, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[5, 1, 1, "", "ocr_predictor"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", 
"RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", "RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[7, 0, 1, "", "LocalizationConfusion"], [7, 0, 1, "", "OCRMetric"], [7, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.OCRMetric": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.TextMatch": [[7, 2, 1, "", "summary"]], "doctr.utils.visualization": [[7, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 7], "0": [1, 3, 5, 6, 7], "00": 5, "01": 5, "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 5, "02562": 5, "03": 3, "035": [], "0361328125": [], "04": [], "05": 3, "06": [], "06640625": [], "07": [], "08": 5, "09": [], "0966796875": [], "1": [1, 3, 5, 6, 7], "10": [1, 5, 7], "100": [5, 6, 7], "1000": 5, "101": [], "1024": [5, 7], "104": [], "106": [], "108": [], "1095": [], "11": 3, "110": 7, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 5, "120": [], "123": [], "126": [], "1268": [], "128": 5, "13": 5, "130": [], "13068": [], "131": [], "1337891": [], "1357421875": [], "1396484375": [], "14": 5, "1420": [], "14470v1": [], "149": [], "15": 5, "150": 7, "154": 1, "1552": [], "16": 5, "160": 5, "1630859375": [], "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": 3, "185546875": [], "19": 5, "1900": [], "1910": 5, "19342": [], "19370": [], "195": [], "19598": [], "199": 5, "1999": [], "1m": 5, "2": [3, 5, 6], "20": 5, "200": 7, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 3, "2021": 3, "2023": [], "207901": [], "21": 5, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6], "225": 6, "22672": [], "229": 6, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 5, "2504": [], "255": [5, 6, 7], "256": 5, "257": [], "26": [], "26032": [], "264": [], "27": 5, "2700": [], "2710": [], "2749": [], "28": 3, "287": [], "29": 5, "296": [], "299": [], "2d": [], "3": [2, 3, 4, 5, 6, 7], "30": 5, "300": [], "3000": [], "301": [], "30595": 5, "30ghz": [], "31": 5, "32": [1, 5, 6], "3232421875": [], "33": [], "33402": [], "33608": [], "34": [], "340": [], "3456": [], "3515625": [], "36": [], "360": [], "37": [], "38": [], "39": 5, "4": [], "40": [], "406": 6, "41": [], "42": [], "43": 5, "44": [], "45": [], "456": 6, "46": 5, "47": 5, "472": [], "48": 5, "485": 6, "49": 5, "49377": [], "5": [1, 6, 7], "50": 5, "51": [], "51171875": [], "512": [], "52": [1, 5], "529": [], "53": 5, "533": [], "54": [], "540": [], "5478515625": [], "55": [], "56": [], "57": [], "58": [], "580": [], "5810546875": [], "583": [], "59": 5, "595": [], "597": [], "5k": [], "5m": 5, "6": [4, 5, 6], "60": 6, "600": [5, 7], "61": 5, "611": [], "62": 5, "625": [], "626": [], "629": [], "63": 5, "630": [], "64": [5, 6], "640": [], "641": [], "647": [], "65": 5, "66": 5, "660": [], "664": [], "666": [], "67": 5, "672": [], "68": 5, "689": [], "69": 5, "693": [], "694": [], "695": [], "6m": [], "7": 5, "70": [5, 7], "700": [], "701": [], "702": [], "707470": [], "71": [], "7100000": [], 
"713": [], "7141797": [], "7149": [], "72": [], "72dpi": [], "73": [], "73257": [], "733": [], "74": 5, "745": [], "75": 5, "753": [], "7581382": [], "76": [], "77": 5, "772": [], "772875": [], "78": 5, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 5, "793533": [], "796": [], "798": [], "7m": [], "8": [5, 6], "80": [], "800": [5, 7], "81": 5, "817": [], "82": 5, "8275l": 5, "83": 5, "830": [], "84": [], "849": [], "85": 5, "8564453125": [], "857": [], "85875": [], "86": 5, "860": [], "8603515625": [], "862": [], "863": [], "87": 5, "8707": [], "875": [], "88": [], "89": 5, "8m": 5, "9": [], "90": 5, "90k": [], "90kdict32px": [], "91": 5, "913": [], "914085328578949": [], "917": [], "92": 5, "921": [], "93": [], "94": [], "95": 7, "9578408598899841": [], "96": 1, "97": [], "98": [], "99": [], "9949972033500671": [], "A": [1, 2, 3, 5], "And": 5, "As": [], "Be": [], "Being": [], "By": [], "For": [4, 5], "If": [2, 4, 5], "In": [1, 5], "It": 6, "Its": 5, "No": [], "Of": 1, "Or": [], "The": [1, 2, 5, 7], "Then": 5, "To": [], "_": [1, 5], "__call__": [], "_build": [], "_i": 7, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": [], "abl": [], "about": 5, "abov": 5, "abstract": 1, "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 3], "account": [], "accur": [], "accuraci": 7, "achiev": [], "act": [], "action": [], "activ": [], "ad": 6, "adapt": [], "add": [6, 7], "add_hook": [], "add_label": 7, "addit": [], "addition": 5, "address": 2, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": [], "aggreg": [1, 7], "aggress": [], "align": 2, "all": [1, 2, 3, 5, 6, 7], "allow": [], "along": 5, "alreadi": [], "also": [], "alwai": [], "an": [1, 2, 3, 5, 7], "analysi": [2, 5], "ancient_greek": [], "andrej": [], "angl": 2, "ani": [1, 2, 3, 5, 6, 7], "annot": 2, "anot": [], "anoth": [1, 4, 5], "answer": [], "anyascii": [], "anyon": 3, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": 5, "appoint": [], "appreci": [], "appropri": [], "ar": [1, 2, 4, 5, 6, 7], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [3, 5], "archiv": [], "area": [], "argument": [1, 2], "around": 5, "arrai": [2, 7], "art": 3, "artefact": 7, "artefact_typ": 2, "articl": [], "artifici": [], "arxiv": 5, "as_imag": 2, "asarrai": 7, "ascii_lett": 1, "aspect": [3, 6], "assess": 7, "assign": 7, "associ": 2, "assum": [], "assume_straight_pag": [], "astyp": [5, 7], "attack": [], "attend": [3, 5], "attent": [], "autoclass": [], "autom": 3, "automat": [], "autoregress": [], "avail": [3, 5, 6], "averag": [5, 6], "avoid": [], "aw": [3, 5], "awar": [], "azur": [], "b": 7, "b_j": 7, "back": [], "backbon": 5, "backend": 5, "background": [], "bangla": [], "bar": [], "bar_cod": [], "baranovskij": [], "base": 5, "baselin": 5, "batch": [1, 5, 6], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [5, 7], "befor": 1, "begin": 7, "behavior": [], "being": [5, 7], "belong": [], "benchmark": [], "best": [], "beta": 3, "better": [], "between": [6, 7], "bgr": 2, "bilinear": [5, 6], "bin_thresh": [], "binar": [3, 5], "binari": 2, "bit": [], "block": [5, 7], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 2, 5, 6, 7], "boolean": [], "both": [3, 5, 6], "bottom": [], "bound": [1, 2, 6, 7], "box": [1, 2, 7], "box_thresh": [], "brew": 4, "bright": 6, "browser": 
[], "build": [], "built": [], "byte": [2, 5], "c": [], "c5": 5, "c_j": [], "cach": [], "cache_sampl": [], "cairo": 4, "call": [], "callabl": [1, 6], "can": [1, 4, 5], "capabl": 5, "case": [1, 7], "cf": 5, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": 3, "channel": [2, 5, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 3, 5, 7], "charactergener": [], "characterist": [], "charg": 5, "charset": [], "chart": 2, "check": [], "checkpoint": [], "chip": [], "christian": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 2, 6, 7], "class_nam": [], "classif": [], "classmethod": 2, "clear": [], "clone": 4, "close": [], "co": [], "code": [2, 3], "codecov": [], "colab": [], "collate_fn": [], "collect": 2, "color": 6, "colorinvers": 6, "column": 2, "com": [2, 4], "combin": 5, "command": [], "comment": [], "commit": [], "common": [6, 7], "commun": [], "compar": 3, "comparison": 7, "competit": 1, "compil": [], "complaint": [], "complementari": 7, "complet": [], "compon": 5, "compos": [1, 3, 5], "comprehens": [], "comput": [5, 7], "conf_threshold": [], "confid": 2, "config": [], "configur": [], "confus": 7, "consecut": [5, 6], "consequ": [], "consid": [1, 2, 7], "consist": [], "consolid": [1, 3], "constant": 6, "construct": [], "contact": [], "contain": [], "content": [1, 2], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 5, "convers": 2, "convert": [2, 5, 6], "convert_page_to_numpi": 2, "convert_to_fp16": 5, "convert_to_tflit": 5, "convolut": 3, "cool": [], "coordin": 2, "cord": [1, 3, 5], "core": 7, "corner": [], "correct": 6, "correspond": [4, 5], "could": [], "counterpart": 7, "cover": [], "coverag": [], "cpu": [3, 5], "creat": [], "crnn": [3, 5], "crnn_mobilenet_v3_larg": [], "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 5, "crnn_vgg16_bn": 5, "crop": 5, "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": [], "daili": 3, "danish": [], "data": [2, 3, 5, 6, 7], "dataload": 1, "dataset": 5, "dataset_info": [], "date": [], "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": [], "db_resnet34": [], "db_resnet50": 5, "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [3, 5], "deal": [], "decis": [], "decod": 2, "decode_img_as_tensor": [], "dedic": [], "deem": [], "deep": 5, "def": [], "default": [2, 5], "defer": 1, "defin": 7, "deform": 5, "degre": [], "degress": 2, "delet": [], "delimit": [], "delta": 6, "demo": [], "demonstr": [], "depend": [3, 4], "deploi": [], "deploy": [], "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": [], "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": 5, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": [], "developp": 4, "deviat": 6, "devic": [], "dict": [2, 7], "dictionari": [2, 7], "differ": [], "differenti": [3, 5], "digit": 1, "dimens": [2, 5, 7], "dimension": 6, "direct": [], "directli": 5, "directori": [], "disabl": [], 
"disable_crop_orient": [], "disable_page_orient": [], "disclaim": 5, "discuss": [], "disk": [], "disparag": [], "displai": [2, 7], "display_artefact": 7, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": [], "do": 4, "doc": [2, 5], "docartefact": [], "docstr": [], "doctr": 4, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 5, 7], "documentbuild": [], "documentfil": 2, "doesn": [], "don": [], "done": 6, "download": 1, "downsiz": [], "draw": 6, "drop": 1, "drop_last": 1, "dtype": 5, "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": [], "dynamic_seq_length": [], "e": [2, 4], "each": [1, 2, 3, 5, 6, 7], "eas": [], "easi": [3, 7], "easier": 5, "easili": [2, 5, 7], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 5], "either": 5, "element": [1, 2, 5], "els": [], "email": [], "empathi": [], "en": [], "enabl": 2, "enclos": 2, "encod": [1, 2, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 3, 7], "english": [], "enough": 5, "ensur": [], "entir": 2, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 2, "ethnic": [], "evalu": [1, 3, 5], "event": [], "everyon": [], "everyth": [], "exact": 7, "exactmatch": [], "exampl": [1, 2, 5, 6, 7], "exchang": [], "exclud": 5, "execut": [], "exist": [], "expand": [], "expect": [2, 5, 6], "experi": 5, "explan": 5, "explicit": [], "exploit": 5, "export": [2, 3, 7], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 2, "extern": [], "extra": 4, "extract": [1, 3], "extract_arch": 1, "extractor": 5, "f_": 7, "f_a": 7, "factor": 6, "fair": [], "fairli": [], "fals": [1, 5, 6, 7], "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [5, 7], "feed": 5, "feedback": [], "feel": [], "felix92": [], "few": 4, "figsiz": 7, "figur": 7, "file": [1, 3], "file_hash": 1, "file_nam": 1, "final": [], "find": 4, "fine": 3, "finnish": [], "first": [], "firsthand": [], "fit": [], "fitz": 2, "flag": [], "flexibl": 7, "flip": [], "float": [2, 6, 7], "float32": 5, "fn": 6, "focu": [], "focus": [], "folder": [1, 5], "follow": [1, 4, 5, 6, 7], "font": [], "font_famili": [], "foral": 7, "forc": [], "forg": [], "form": [1, 3], "format": [2, 5], "forpost": [1, 3], "forum": [], "found": [], "fp": 5, "fp16": 5, "frac": 7, "frame": 5, "framework": 1, "free": [], "french": [1, 5], "friendli": 3, "from": [1, 2, 3, 5, 6, 7], "from_hub": [], "from_imag": 2, "from_pdf": 2, "from_url": 2, "full": [1, 5, 7], "fulli": [], "function": [5, 6, 7], "funsd": [1, 3, 5], "further": [], "futur": [], "g": 2, "g_": 7, "g_x": 7, "gallagh": [], "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 4, "gen": [], "gender": [], "gener": [], "generic_cyrillic_lett": [], "geometri": 2, "geq": 7, "german": [], "get": 2, "get_artefact": 2, "get_word": 2, "gettextword": 2, "git": 3, "github": 4, "give": [], "given": [1, 2, 5, 7], "global": [], "go": [], "good": [], "googl": [], "googlevis": 3, "gpu": 3, "gracefulli": [], "graph": 2, "grayscal": 6, "ground": 7, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "gtk": 4, "guid": [], "guidanc": [], "gvision": 5, "h": 2, "h_": 7, "ha": [1, 7], "half": 5, "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 7, "have": [1, 5, 7], "head": [], "healthi": [], "hebrew": [], "height": 2, "hello": 7, 
"help": [], "here": [1, 4, 6], "hf": [], "hf_hub_download": [], "high": 2, "higher": 4, "hindi": [], "hindi_digit": [], "hocr": [], "hook": [], "horizont": 2, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [2, 4, 5], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 2, 5, 6, 7], "i7": [], "ibrahimov": [], "ic03": [], "ic13": [], "icdar": 3, "icdar2019": 1, "id": 5, "ident": [], "identifi": [3, 5], "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 2, 5, 6, 7], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": [], "img_fold": 1, "img_path": [], "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 2, 5, 6, 7], "import": [1, 2, 5, 6, 7], "improv": [], "inappropri": [], "incid": [], "includ": [4, 5], "inclus": [], "increas": 6, "independ": [], "index": 2, "indic": 7, "individu": [], "infer": [3, 6], "inform": [1, 3, 5], "inherit": [1, 5], "input": [2, 5, 6], "input_crop": [], "input_pag": [5, 7], "input_shap": 5, "input_t": 5, "input_tensor": 5, "inspir": 6, "instal": 3, "instanc": 5, "instanti": 5, "instead": [1, 2], "insult": [], "int": [1, 2, 5, 6, 7], "int64": [], "integ": 7, "integr": 3, "intel": [], "interact": [2, 7], "interfac": [], "interoper": [], "interpol": [5, 6], "interpret": [1, 2], "intersect": 7, "invert": 6, "investig": [], "invis": [], "invoic": 5, "involv": 5, "io": [], "iou": 7, "iou_thresh": 7, "iou_threshold": [], "irregular": 5, "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 2, 5, 7], "itself": [], "j": 7, "jame": [], "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 2], "json": [], "json_output": [], "jump": [], "just": 5, "kei": [], "kera": 5, "kernel": [], "kernel_s": 5, "kernel_shap": [], "keywoard": [], "keyword": [1, 2], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 2, 5, 7], "l": 7, "l_j": 7, "label": [1, 7], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": [], "languag": [2, 3], "larg": [], "largest": 7, "last": [1, 4, 5], "latenc": [], "later": [], "latest": 4, "latin": 1, "layer": [], "layout": [], "lead": [], "leader": [], "learn": 5, "least": 4, "left": 7, "legacy_french": [], "length": 1, "less": [], "let": 5, "letter": [], "level": [5, 7], "levenshtein": [], "leverag": [], "lf": [], "libffi": 4, "librari": 4, "light": 3, "lightweight": [], "like": [], "limits_": 7, "line": [3, 7], "line_1_1": [], "link": [], "linknet": [3, 5], "linknet16": 5, "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 4, "list": [1, 2, 6], "ll": 7, "load": [3, 5], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 3, 5, 7], "localis": [], "localizationconfus": 7, "locat": [], "login": [], "login_to_hub": [], "logo": 2, "love": [], "lower": [6, 7], "m": [5, 7], "m1": [], "macbook": [], "machin": [], "maco": 4, "made": 3, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 3, "mainten": [], "make": [5, 7], "mani": [], "manipul": [], "map": 1, "map_loc": [], "mask_shap": 7, "master": [3, 5], "match": [3, 7], "mathcal": 7, "matplotlib": 7, "max": 7, "max_angl": [], "max_area": [], "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": [], "maximum": 1, "maxval": [5, 6], "mbox": 7, "mean": [6, 7], "meaniou": 7, "meant": 2, "measur": 5, "media": [], "median": [], 
"meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [5, 7], "middl": [], "might": 5, "min": [], "min_area": [], "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": [], "min_val": 6, "minde": 4, "minim": [], "minimalist": [], "minimum": 7, "minval": 6, "miss": [], "mistak": [], "mix": 3, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": [], "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 4, "model": [1, 7], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [2, 5, 6, 7], "more": [], "moscardi": [], "most": 5, "mozilla": [], "multi": 3, "multilingu": [], "multipl": [1, 2, 6], "multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 5, 7], "na": [], "name": [1, 5], "nation": [], "natur": 3, "ndarrai": [1, 2, 7], "necessari": [], "need": [4, 7], "neg": 6, "nest": [], "nestedobject": [], "netraj": [], "network": [3, 5], "neural": [3, 5], "new": [], "newer": [], "next": 1, "nois": [], "noisi": [1, 3], "non": [2, 3, 6, 7], "none": [1, 2, 7], "normal": [5, 6], "norwegian": [], "note": 0, "now": 3, "np": [5, 7], "num_output_channel": [], "num_sampl": [], "number": [1, 6, 7], "numpi": [2, 5, 7], "o": 4, "obb": [], "obj_detect": [], "object": 1, "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [1, 3, 7], "ocr_carea": [], "ocr_db_crnn": 7, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 5, "ocrdataset": 1, "ocrmetr": 7, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": 5, "one": [1, 5, 6], "oneof": 6, "ones": 1, "onli": [6, 7], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [3, 5], "optim": 3, "option": 1, "order": [1, 2, 5], "org": 5, "organ": 2, "orient": 2, "orientationpredictor": [], "other": [], "otherwis": 7, "our": 5, "out": [5, 6, 7], "outpout": [], "output": [2, 5, 6], "output_s": [2, 6], "outsid": [], "over": [4, 7], "overal": [], "overlai": 2, "overview": [], "overwrit": 1, "overwritten": [], "own": 3, "p": 6, "packag": 7, "pad": [1, 5, 6], "page": [4, 5, 7], "page1": 2, "page2": 2, "page_1": [], "page_idx": 2, "page_orientation_predictor": [], "page_param": [], "pair": 7, "pango": 4, "paper": 5, "par_1_1": [], "paragraph": [], "paragraph_break": [], "parallel": [], "param": [5, 6], "paramet": [1, 2, 3, 5, 6, 7], "pars": [1, 3], "parseq": [], "part": 6, "parti": [], "partial": [], "particip": [], "pass": [1, 5], "password": [], "patch": [], "path": [1, 2, 5], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "patil": [], "pattern": [], "pdf": [2, 5], "pdfpage": [], "peopl": [], "per": [5, 6], "perform": [2, 3, 5, 6, 7], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 2, "pick": 6, "pictur": 2, "pip": 4, "pipelin": [], "pixbuf": 4, "pixel": [2, 6], "platinum": 5, "pleas": [], "plot": 7, "plt": 7, "plug": [], "plugin": [], "png": 2, "point": [], "polici": [], "polish": [], "polit": [], "polygon": 1, "pool": [], "portugues": [], "posit": 7, "possibl": 7, "post": 5, "postprocessor": [], "potenti": 5, "power": 3, "ppageno": [], "pre": [], "precis": [5, 7], "pred": [], 
"pred_box": [], "pred_label": [], "predefin": 1, "predict": [2, 7], "predictor": [], "prefer": 1, "preinstal": [], "preprocessor": 5, "prerequisit": 3, "present": [], "preserv": 6, "preserve_aspect_ratio": 6, "pretrain": [3, 5, 7], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": 5, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 3], "processor": 5, "produc": 5, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 5, "provid": [3, 5], "public": 3, "publicli": [], "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": [], "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 7, "python": 3, "python3": [], "pytorch": [3, 4], "q": [], "qr": 2, "qr_code": [], "qualiti": 6, "quantiz": 5, "quantize_model": 5, "question": [], "quickli": 3, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [5, 7], "random": [5, 6, 7], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": [], "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": [], "randomsatur": 6, "randomshadow": [], "rang": 6, "rassi": [], "ratio": 6, "raw": [2, 7], "re": [], "read": [3, 5], "read_html": 2, "read_img": 2, "read_img_as_numpi": [], "read_img_as_tensor": [], "read_pdf": 2, "readi": [], "real": [5, 6], "realli": [], "reason": [], "rebuild": [], "rebuilt": [], "recal": [5, 7], "receipt": [1, 3, 5], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 7, "recognition_predictor": 5, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": [], "recurr": 3, "reduc": 6, "refer": 4, "regardless": [], "region": [], "regroup": 7, "regular": [], "reject": [], "rel": 2, "relat": [], "releas": [0, 4], "relev": [], "religion": [], "relu": 5, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [2, 5], "represent": 5, "request": [], "requir": [4, 6], "research": 3, "residu": [], "resiz": [5, 6], "resnet": 5, "resnet18": [], "resnet31": [], "resnet34": [], "resnet50": [], "resolv": 2, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [6, 7], "restrict": [], "result": [2, 5], "return": [1, 2, 5, 7], "reusabl": 5, "review": [], "rgb": [2, 6], "rgb_mode": [], "rgb_output": 2, "right": [5, 7], "roboflow": [], "robust": 3, "root": 1, "rotat": [1, 2], "rotated_bbox": [1, 7], "run": 4, "same": [2, 7], "sampl": 1, "sample_transform": 1, "sanjin": [], "sar": [3, 5], "sar_resnet31": 5, "sar_vgg16_bn": 5, "satur": 6, "save": [1, 5], "saved_model": 5, "scale": 7, "scale_rang": [], "scan": [1, 3], "scene": [3, 5], "scheme": 5, "score": 7, "scratch": 3, "script": [], "seamless": 3, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 5, "section": [], "secur": [], "see": [], "seemlessli": 3, "seen": 5, "segment": 5, "self": [], "semant": 5, "send": [], "sens": 7, "sensit": [], "separ": 5, "sequenc": [1, 2, 5, 7], "sequenti": [5, 6], "seri": [], "serial": 5, "serialized_model": 5, "seriou": [], "set": [1, 5, 7], "set_global_polici": [], "sever": [2, 6], "sex": [], "sexual": [], "sha256": [], "shade": [], "shape": [2, 5, 6, 7], "share": [], "shift": 6, "shm": [], "should": [1, 2, 7], "show": [2, 3, 5, 7], "showcas": [], "shuffl": 1, "side": 7, "signatur": 2, "signific": 1, "simpl": 5, "simpler": [], "sinc": 1, "singl": [], "single_img_doc": [], "size": [1, 2, 5, 6], "skew": [], 
"slack": [], "slightli": [], "small": 3, "smallest": 2, "snapshot_download": [], "snippet": [], "so": [1, 4], "social": [], "socio": [], "some": [], "someth": [], "somewher": [], "sort": [], "sourc": [1, 2, 5, 6, 7], "space": [], "span": [], "spanish": [], "spatial": 2, "special": 3, "specif": [1, 5, 7], "specifi": 2, "speed": [3, 5], "sphinx": [], "sroie": [1, 3], "stabl": 4, "stackoverflow": [], "stage": 3, "standalon": [], "standard": 6, "start": 1, "state": 3, "static": 7, "statist": 5, "statu": [], "std": 6, "step": [], "still": [], "str": [1, 2, 5, 6, 7], "straight": 1, "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 2, "street": [], "strict": [], "strictli": 7, "string": [1, 2, 5, 7], "strive": [], "strong": 5, "structur": [3, 5], "subset": [1, 5], "suggest": [], "sum": 7, "summari": 7, "support": 5, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": [], "symmetr": 6, "symmetric_pad": 6, "synthet": [], "synthtext": [], "system": [], "t": 1, "tabl": [], "take": [], "target": [1, 2, 5, 6], "target_s": 1, "task": [1, 3, 5], "task2": [], "team": [], "techminde": [], "templat": 2, "tensor": [1, 5, 6], "tensorflow": [3, 4, 5, 6], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [2, 7], "text_output": [], "textmatch": 7, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [3, 5], "textstylebrush": [], "textual": [1, 2, 3], "tf": [5, 6], "tf_model": 5, "tflite": 5, "than": [4, 7], "thank": [], "thei": [], "them": [1, 4], "thi": [4, 5, 7], "thing": [], "third": [], "those": [2, 4, 5], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 5, 7], "tini": [], "titl": 2, "tm": [], "tmp": [], "togeth": [2, 5], "tograi": 6, "tool": [], "top": 7, "topic": [], "torch": [], "torchvis": 6, "total": [], "toward": [], "train": [1, 5, 6], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": 5, "tranform": 6, "transcrib": [], "transfer": [], "transfo": 6, "transform": [1, 3], "translat": [], "troll": [], "true": [1, 2, 5, 6, 7], "truth": 7, "tune": 3, "tupl": [2, 5, 6, 7], "turn": [], "two": 2, "txt": [], "type": [2, 5], "typic": [], "u": [], "ucsd": [], "udac": [], "uint8": [2, 5, 7], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 2, "understand": [1, 3], "unidecod": 7, "uniform": [5, 6], "uniformli": [], "uninterrupt": 2, "union": 7, "unit": [], "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 5, "updat": 7, "upgrad": [], "upper": 6, "uppercas": [], "url": [1, 2], "us": [1, 4, 7], "usabl": 5, "usag": 5, "use_polygon": [], "useabl": [], "user": [2, 3, 4], "utf": [], "util": [3, 5], "v0": 3, "v1": [], "v3": [], "valid": [], "valu": [2, 6], "valuabl": 3, "variabl": [], "varieti": [], "veri": [], "verifi": 1, "verma": [], "version": 5, "vgg": 5, "vgg16": 5, "vgg16_bn_r": [], "via": 3, "video": [], "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 1, "visiontransform": [], "visual": 3, "visualize_pag": 7, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": [3, 5], "vocabulari": [], "w": [2, 7], "w3": [], "wa": [], "wai": [1, 3, 5], "want": [], "warm": 5, "warmup": [], "wasn": [], "we": [2, 3, 5, 6], "weasyprint": [], "web": 2, "websit": [], "welcom": 3, "well": [], "were": 2, "what": [], "when": [], "whenev": [], "where": [2, 7], "whether": [1, 2, 7], "which": 5, 
"whichev": 4, "while": 6, "why": [], "width": 2, "wiki": [], "wildreceipt": [], "window": [4, 7], "wish": [], "within": [], "without": 5, "wonder": [], "word": [3, 5, 7], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 7, "work": [], "worker": 1, "workflow": [], "worklow": [], "world": 7, "worth": [], "wrap": [], "wrapper": [1, 6], "write": [], "written": 2, "www": 2, "x": [2, 6, 7], "x12larg": 5, "x_ascend": [], "x_descend": [], "x_i": 7, "x_size": [], "x_wconf": [], "xeon": 5, "xhtml": [], "xmax": 2, "xmin": 2, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 7, "y_i": 7, "y_j": 7, "yet": [], "ymax": 2, "ymin": 2, "yolov8": [], "you": [4, 5], "your": [1, 2, 5, 7], "yoursit": 2, "yugesh": [], "zero": [5, 6], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "doctr.documents", "DocTR: Document Text Recognition", "Installation", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": [], "03": 0, "04": [], "05": 0, "07": [], "08": [], "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "21": [], "22": [], "27": [], "28": 0, "29": [], "3": [], "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 5, "architectur": [], "arg": [], "artefact": 2, "artefactdetect": [], "attribut": [], "avail": 1, "aw": [], "ban": [], "block": 2, "bug": [], "build": 3, "changelog": 0, "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 5, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 3], "detect": [3, 5], "develop": [], "do": [], "doctr": [1, 2, 3, 5, 6, 7], "document": [2, 3], "end": 5, "enforc": [], "evalu": 7, "export": 5, "factori": [], "featur": 3, "feedback": [], "file": 2, "from": [], "gener": [], "get": 3, "git": 4, "guidelin": [], "half": [], "hub": [], "huggingfac": [], "i": [], "implement": [], "infer": [], "instal": 4, "integr": [], "io": [], "lambda": [], "let": [], "line": 2, "linux": [], "load": 1, "loader": [], "main": 3, "mode": [], "model": [3, 5], "modifi": [], "modul": [], "name": [], "note": 3, "notebook": [], "object": [], "ocr": 5, "onli": [], "onnx": [], "optim": 
[], "option": [], "orient": [], "our": [], "output": [], "own": [], "packag": [3, 4], "page": 2, "perman": [], "pipelin": [], "pledg": [], "post": [], "pre": 5, "precis": [], "predictor": [3, 5], "prepar": [], "prerequisit": 4, "pretrain": [], "process": 5, "push": [], "python": 4, "qualiti": [], "question": [], "read": 2, "readi": [], "recognit": [3, 5], "refer": 3, "report": [], "request": [], "resourc": [], "respons": [], "return": [], "right": [], "savedmodel": 5, "scope": [], "share": [], "should": [], "stage": 5, "standard": [], "start": 3, "structur": 2, "style": [], "support": [1, 3, 6], "synthet": [], "task": 7, "temporari": [], "test": [], "text": [3, 5], "train": 3, "transform": 6, "two": 5, "unit": [], "us": 5, "util": 7, "v0": 0, "verif": [], "via": 4, "visual": 7, "vocab": 1, "warn": [], "what": [], "word": 2, "your": 3, "zoo": [3, 5]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, "codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], 
"Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, "end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], "Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train 
your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], 
"channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], 
"line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation 
(class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", 
false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", 
"Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], 
"51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, 
"b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], 
"db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 
18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], 
"json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 
19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 
15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 
3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 
18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, 
"coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.3.1/transforms.html b/v0.3.1/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.3.1/transforms.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.
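As a quick illustration of this composability, here is a minimal sketch chaining two of the modules documented below (the input shape and normalization statistics are arbitrary, illustrative values):
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, Normalize
>>> # resize to 32x32, then normalize each channel with illustrative statistics
>>> transfo = Compose([Resize((32, 32)), Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))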

-
-

Supported transformations

-

Here are all transformations that are available through DocTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
- -
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor per channel with the given mean and standard deviation

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a custom transformation function to the input tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): -convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta -to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting -each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and -increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.

-
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomJpegQuality, RandomGamma
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, one only will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Apply the input transformation with probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.3.1/using_doctr/custom_models_training.html b/v0.3.1/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.3.1/using_doctr/custom_models_training.html +++ b/v0.3.1/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.3.1/using_doctr/running_on_aws.html b/v0.3.1/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.3.1/using_doctr/running_on_aws.html +++ b/v0.3.1/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.3.1/using_doctr/sharing_models.html b/v0.3.1/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.3.1/using_doctr/sharing_models.html +++ b/v0.3.1/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.3.1/using_doctr/using_contrib_modules.html b/v0.3.1/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.3.1/using_doctr/using_contrib_modules.html +++ b/v0.3.1/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.3.1/using_doctr/using_datasets.html b/v0.3.1/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.3.1/using_doctr/using_datasets.html +++ b/v0.3.1/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.3.1/using_doctr/using_model_export.html b/v0.3.1/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.3.1/using_doctr/using_model_export.html +++ b/v0.3.1/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.3.1/using_doctr/using_models.html b/v0.3.1/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.3.1/using_doctr/using_models.html +++ b/v0.3.1/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.3.1/utils.html b/v0.3.1/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.3.1/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.utils

-

This module regroups non-core features that are complementary to the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest window side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model’s performance.

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements a text match metric (word-level accuracy) for the recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, -TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, -f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, -\(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
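For reference, with the update above, ‘world’ is an exact match while ‘Hello’ differs from ‘hello’ only by case, so the raw exact-match score reported by summary() should be 1/2 = 0.5 and its lower-case counterpart 1.0.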
-
-
-
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode -counterpart and its lower-case unidecode counterpart

-
-
-
- -
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ -Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, -g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
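For reference, in the example above the 70×70 box overlaps the 100×100 box with IoU = 4900/10000 = 0.49, just under the default iou_thresh of 0.5, and the second box does not overlap it at all; no pair is therefore counted as a match, so both recall and precision should come out as 0 here.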
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float | None, float | None, float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores

-
-
-
- -
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements an end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, -\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ -Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, -h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{L}\) is the set of possible character sequences, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall & precision for each string comparison flexibility and the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.4.0/_modules/doctr/datasets/cord.html b/v0.4.0/_modules/doctr/datasets/cord.html index f98ee6901c..55b0584830 100644 --- a/v0.4.0/_modules/doctr/datasets/cord.html +++ b/v0.4.0/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.cord

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
-from doctr.utils.geometry import fit_rbbox
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['CORD']
+__all__ = ["CORD"]
 
 
 
-[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - Example:: - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 + :align: center + + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', - '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', - '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", + "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", + "cord_train.zip", + ) + + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", + "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", + "cord_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - - # # List images - self.root = os.path.join(self._root, 'image') - self.data: List[Tuple[str, Dict[str, Any]]] = [] + # List images + tmp_root = os.path.join(self.root, "image") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] self.train = train - self.sample_transforms = sample_transforms - for img_path in os.listdir(self.root): + np_dtype = np.float32 + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem _targets = [] - with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: if len(word["text"]) > 0: x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - if rotated_bbox: - box = list(fit_rbbox(np.array([ - [x[0], y[0]], - [x[1], y[1]], - [x[2], y[2]], - [x[3], y[3]], - ], dtype=np.float32))) + box: Union[List[float], np.ndarray] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], + dtype=np_dtype, + ) else: - # Reduce 8 coords to 4 + # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax box = [min(x), min(y), max(x), max(y)] - _targets.append((word['text'], box)) + _targets.append((word["text"], box)) text_targets, box_targets = zip(*_targets) - self.data.append(( - img_path, - dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets) - )) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
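The newly added recognition_task and detection_task flags change what each sample contains. A minimal usage sketch (the flag combinations and download=True are illustrative assumptions, following the docstring example above):
>>> from doctr.datasets import CORD
>>> # word crops paired with their transcriptions, for recognition training
>>> reco_set = CORD(train=True, download=True, recognition_task=True)
>>> crop, word = reco_set[0]
>>> # images paired with bounding boxes only, for detection training
>>> det_set = CORD(train=False, download=True, detection_task=True)
>>> img, boxes = det_set[0]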
@@ -397,8 +461,8 @@

Source code for doctr.datasets.cord

       
     
   
-
- + + diff --git a/v0.4.0/_modules/doctr/datasets/core.html b/v0.4.0/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.4.0/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
- -
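For reference, the removed `VisionDataset` above handled caching, download and archive extraction for its subclasses. A hypothetical subclass (placeholder URL, no checksum, illustrative only) would have looked roughly like this under the old v0.4.0 API:

```python
import os
from typing import Any, Dict, List, Tuple

import numpy as np

from doctr.datasets.core import VisionDataset  # legacy module, removed in this changeset


class MyReceipts(VisionDataset):
    # Placeholder location for a hypothetical archive of JPEG images
    URL = "https://example.com/my_receipts.zip"
    SHA256 = None

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(self.URL, None, self.SHA256, extract_archive=True, **kwargs)
        # The legacy base class exposes the extracted folder as `self._root`
        self.root = self._root
        self.sample_transforms = None
        # Dummy targets: one empty box array per image
        self.data: List[Tuple[str, Dict[str, Any]]] = [
            (img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[]))
            for img_name in os.listdir(self.root)
        ]
```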
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.4.0/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.4.0/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset, _VisionDataset): - pass
- -
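The TensorFlow backend shown on this removed page stacks images along a new batch axis and keeps targets as a plain list. A minimal sketch of that behaviour, assuming a TensorFlow install and the legacy module path:

```python
import tensorflow as tf

from doctr.datasets.datasets.tensorflow import AbstractDataset  # legacy module, removed in this changeset

# collate_fn is a staticmethod, so it can be exercised without building a dataset
samples = [(tf.zeros((32, 128, 3)), "hello"), (tf.zeros((32, 128, 3)), "world")]
images, targets = AbstractDataset.collate_fn(samples)
print(images.shape)  # (2, 32, 128, 3)
print(targets)       # ['hello', 'world']
```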
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/_modules/doctr/datasets/detection.html b/v0.4.0/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.4.0/_modules/doctr/datasets/detection.html +++ b/v0.4.0/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/doc_artefacts.html b/v0.4.0/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.4.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.4.0/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.4.0/_modules/doctr/datasets/funsd.html b/v0.4.0/_modules/doctr/datasets/funsd.html index 35d7ad4cf5..f08612f9fa 100644 --- a/v0.4.0/_modules/doctr/datasets/funsd.html +++ b/v0.4.0/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.funsd

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['FUNSD']
+__all__ = ["FUNSD"]
 
 
 
-[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - Example:: - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 + :align: center + + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' - SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' - FILE_NAME = 'funsd.zip' + URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" + SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" + FILE_NAME = "funsd.zip" def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + super().__init__( + self.URL, + self.FILE_NAME, + self.SHA256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - self.sample_transforms = sample_transforms + np_dtype = np.float32 # Use the subset - subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') + subfolder = os.path.join("dataset", "training_data" if train else "testing_data") # # List images - self.root = os.path.join(self._root, subfolder, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + tmp_root = os.path.join(self.root, subfolder, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: data = json.load(f) - _targets = [(word['text'], word['box']) for block in data['form'] - for word in block['words'] if len(word['text']) > 0] + _targets = [ + (word["text"], word["box"]) + for block in data["form"] + for word in block["words"] + if len(word["text"]) > 0 + ] text_targets, box_targets = zip(*_targets) - if rotated_bbox: - # box_targets: xmin, ymin, xmax, ymax -> x, y, w, h, alpha = 0 - box_targets = [ + if use_polygons: + # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + box_targets = [ # type: ignore[assignment] [ - (box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0 - ] for box in box_targets + [box[0], box[1]], + [box[2], box[1]], + [box[2], box[3]], + [box[0], box[3]], + ] + for box in box_targets ] - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets))) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) + ) + for crop, label in zip(crops, list(text_targets)): + # filter labels with unknown characters + if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
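FUNSD gains the same flags as CORD above. A short sketch of the recognition variant, which yields word crops and their transcriptions (assuming the archive can be fetched with `download=True`):

```python
from doctr.datasets import FUNSD

# Whole-page detection + recognition targets
train_set = FUNSD(train=True, download=True)
img, target = train_set[0]

# Pre-cropped words for recognition training; crops whose labels contain
# checkbox glyphs are filtered out, as in the loading code above
reco_set = FUNSD(train=True, download=True, recognition_task=True)
crop, word = reco_set[0]
```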
@@ -386,8 +453,8 @@

Source code for doctr.datasets.funsd

       
     
   
-
- + + diff --git a/v0.4.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.4.0/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.4.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.4.0/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.4.0/_modules/doctr/datasets/ic03.html b/v0.4.0/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.4.0/_modules/doctr/datasets/ic03.html +++ b/v0.4.0/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/ic13.html b/v0.4.0/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.4.0/_modules/doctr/datasets/ic13.html +++ b/v0.4.0/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/iiit5k.html b/v0.4.0/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.4.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.4.0/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/iiithws.html b/v0.4.0/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.4.0/_modules/doctr/datasets/iiithws.html +++ b/v0.4.0/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/imgur5k.html b/v0.4.0/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.4.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.4.0/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/loader.html b/v0.4.0/_modules/doctr/datasets/loader.html index d32e6da298..ed80350ef0 100644 --- a/v0.4.0/_modules/doctr/datasets/loader.html +++ b/v0.4.0/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.loader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-import tensorflow as tf
-import numpy as np
-from typing import Optional
+from typing import Callable, Optional
 
-from .multithreading import multithread_exec
+import numpy as np
+import tensorflow as tf
 
 __all__ = ["DataLoader"]
 
@@ -293,12 +314,13 @@ 

Source code for doctr.datasets.loader

     """Collate multiple elements into batches
 
     Args:
+    ----
         samples: list of N tuples containing M elements
 
     Returns:
+    -------
         Tuple of M sequences containing N elements each
     """
-
     batch_data = zip(*samples)
 
     tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
@@ -307,23 +329,23 @@ 

Source code for doctr.datasets.loader

 
 
 
-[docs] +[docs] class DataLoader: """Implements a dataset wrapper for fast data loading - Example:: - >>> from doctr.datasets import FUNSD, DataLoader - >>> train_set = CORD(train=True, download=True) - >>> train_loader = DataLoader(train_set, batch_size=32) - >>> train_iter = iter(train_loader) - >>> images, targets = next(train_iter) + >>> from doctr.datasets import CORD, DataLoader + >>> train_set = CORD(train=True, download=True) + >>> train_loader = DataLoader(train_set, batch_size=32) + >>> train_iter = iter(train_loader) + >>> images, targets = next(train_iter) Args: + ---- dataset: the dataset shuffle: whether the samples should be shuffled before passing it to the iterator batch_size: number of elements in each batch drop_last: if `True`, drops the last batch if it isn't full - workers: number of workers to use for data loading + collate_fn: function to merge samples into a batch """ def __init__( @@ -332,17 +354,22 @@

Source code for doctr.datasets.loader

         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        workers: Optional[int] = None,
+        collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
         self.batch_size = batch_size
         nb = len(self.dataset) / batch_size
         self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
-        self.workers = workers
+        if collate_fn is None:
+            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
+        else:
+            self.collate_fn = collate_fn
         self.reset()
 
+    def __len__(self) -> int:
+        return self.num_batches
+
     def reset(self) -> None:
         # Updates indices after each epoch
         self._num_yielded = 0
@@ -358,9 +385,9 @@ 

Source code for doctr.datasets.loader

         if self._num_yielded < self.num_batches:
             # Get next indices
             idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
+            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
 
-            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
+            samples = list(map(self.dataset.__getitem__, indices))
 
             batch_data = self.collate_fn(samples)
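With `workers`/`multithread_exec` gone, batching is now a plain `map` over the dataset followed by the collate function, and the loader gains a `__len__`. A usage sketch matching the updated docstring (the inline `collate_fn` override is an assumed example, not part of the library):

```python
from doctr.datasets import CORD, DataLoader

train_set = CORD(train=True, download=True)
train_loader = DataLoader(train_set, shuffle=True, batch_size=32, drop_last=False)
print(len(train_loader))  # number of batches, via the new __len__

train_iter = iter(train_loader)
images, targets = next(train_iter)

# Optionally override batching entirely with a custom collate function
identity_loader = DataLoader(train_set, batch_size=4, collate_fn=lambda samples: samples)
```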
 
@@ -401,8 +428,8 @@ 

Source code for doctr.datasets.loader

       
     
   
-
- +
+ diff --git a/v0.4.0/_modules/doctr/datasets/mjsynth.html b/v0.4.0/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.4.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.4.0/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/ocr.html b/v0.4.0/_modules/doctr/datasets/ocr.html index 11297d5952..ce1ed8b0d4 100644 --- a/v0.4.0/_modules/doctr/datasets/ocr.html +++ b/v0.4.0/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.ocr

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple
 
-from .datasets import AbstractDataset
-from doctr.utils.geometry import fit_rbbox
+import numpy as np
 
+from .datasets import AbstractDataset
 
-__all__ = ['OCRDataset']
+__all__ = ["OCRDataset"]
 
 
 
-[docs] +[docs] class OCRDataset(AbstractDataset): """Implements an OCR dataset + >>> from doctr.datasets import OCRDataset + >>> train_set = OCRDataset(img_folder="/path/to/images", + >>> label_file="/path/to/labels.json") + >>> img, target = train_set[0] + Args: + ---- img_folder: local path to image folder (all jpg at the root) label_file: local path to the label file - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) - **kwargs: keyword arguments from `VisionDataset`. + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + **kwargs: keyword arguments from `AbstractDataset`. """ def __init__( self, img_folder: str, label_file: str, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, **kwargs: Any, ) -> None: - - self.sample_transforms = sample_transforms - self.root = img_folder + super().__init__(img_folder, **kwargs) # List images self.data: List[Tuple[str, Dict[str, Any]]] = [] - with open(label_file, 'rb') as f: + np_dtype = np.float32 + with open(label_file, "rb") as f: data = json.load(f) - for file_dic in data: + for img_name, annotations in data.items(): # Get image path - img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg' + img_name = Path(img_name) # File existence check if not os.path.exists(os.path.join(self.root, img_name)): raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") # handle empty images - if (len(file_dic["coordinates"]) == 0 or - (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")): - self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[]))) + if len(annotations["typed_words"]) == 0: + self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[]))) continue - is_valid: List[bool] = [] - box_targets: List[List[float]] = [] - for box in file_dic["coordinates"]: - if rotated_bbox: - x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32)) - box = [x, y, w, h, alpha] - is_valid.append(w > 0 and h > 0) - else: - xs, ys = zip(*box) - box = [min(xs), min(ys), max(xs), max(ys)] - is_valid.append(box[0] < box[2] and box[1] < box[3]) - if is_valid[-1]: - box_targets.append(box) + # Unpack the straight boxes (xmin, ymin, xmax, ymax) + geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + geoms = [ + [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]] # type: ignore[list-item] + for geom in geoms + ] + + text_targets = [obj["value"] for obj in annotations["typed_words"]] - text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid] - self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
+ self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
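The label file format changes here: instead of the old `raw-archive-filepath`/`coordinates` records, the loader now expects a mapping from image name to a `typed_words` list. A sketch of a matching `labels.json` and its use (paths and values are placeholders; the referenced images must exist in `img_folder`):

```python
import json

from doctr.datasets import OCRDataset

labels = {
    "sample.jpg": {
        "typed_words": [
            {"geometry": [0.10, 0.10, 0.40, 0.20], "value": "Hello"},
            {"geometry": [0.50, 0.10, 0.90, 0.20], "value": "world"},
        ]
    }
}
with open("/path/to/labels.json", "w") as f:
    json.dump(labels, f)

train_set = OCRDataset(img_folder="/path/to/images", label_file="/path/to/labels.json")
img, target = train_set[0]  # target: {"boxes": (N, 4) float32 array, "labels": [str, ...]}
```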
@@ -383,8 +402,8 @@

Source code for doctr.datasets.ocr

       
     
   
- - + + diff --git a/v0.4.0/_modules/doctr/datasets/recognition.html b/v0.4.0/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.4.0/_modules/doctr/datasets/recognition.html +++ b/v0.4.0/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/sroie.html b/v0.4.0/_modules/doctr/datasets/sroie.html index 66fd4ca3e0..04cf10bda2 100644 --- a/v0.4.0/_modules/doctr/datasets/sroie.html +++ b/v0.4.0/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.sroie

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import csv
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['SROIE']
+__all__ = ["SROIE"]
 
 
 
-[docs] +[docs] class SROIE(VisionDataset): """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction" <https://arxiv.org/pdf/2103.10213.pdf>`_. - Example:: - >>> from doctr.datasets import SROIE - >>> train_set = SROIE(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0 + :align: center + + >>> from doctr.datasets import SROIE + >>> train_set = SROIE(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip', - 'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip', - '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0", + "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f", + "sroie2019_train_task1.zip", + ) + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0", + "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2", + "sroie2019_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - self.sample_transforms = sample_transforms self.train = train - if rotated_bbox: - raise NotImplementedError + tmp_root = os.path.join(self.root, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + np_dtype = np.float32 - # # List images - self.root = os.path.join(self._root, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - _targets = [] - with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f: - for row in csv.reader(f, delimiter=','): - # Safeguard for blank lines - if len(row) > 0: - # Label may contain commas - label = ",".join(row[8:]) - # Reduce 8 coords to 4 - p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8]) - left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x) - top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y) - if len(label) > 0: - _targets.append((label, [left, top, right, bot])) - - text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets))) + with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f: + _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0] + + labels = [",".join(row[8:]) for row in _rows] + # reorder coordinates (8 -> (4,2) -> + # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines + coords: np.ndarray = np.stack( + [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0 + ) + + if not use_polygons: + # xmin, ymin, xmax, ymax + coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1) + + if recognition_task: + crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords) + for crop, label in zip(crops, labels): + if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0: + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, coords)) + else: + self.data.append((img_path, dict(boxes=coords, labels=labels))) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
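SROIE follows the same pattern; the new `detection_task` flag keeps only the box geometries. A sketch under the same download assumption:

```python
from doctr.datasets import SROIE

# Detection-only targets: one (N, 4) array of straight boxes per image
det_set = SROIE(train=True, download=True, detection_task=True)
img, boxes = det_set[0]

# Or keep polygons as (N, 4, 2) corner coordinates
poly_set = SROIE(train=True, download=True, use_polygons=True, detection_task=True)
```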
@@ -390,8 +444,8 @@

Source code for doctr.datasets.sroie

       
     
   
-
- + + diff --git a/v0.4.0/_modules/doctr/datasets/svhn.html b/v0.4.0/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.4.0/_modules/doctr/datasets/svhn.html +++ b/v0.4.0/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/svt.html b/v0.4.0/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.4.0/_modules/doctr/datasets/svt.html +++ b/v0.4.0/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/synthtext.html b/v0.4.0/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.4.0/_modules/doctr/datasets/synthtext.html +++ b/v0.4.0/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.4.0/_modules/doctr/datasets/utils.html b/v0.4.0/_modules/doctr/datasets/utils.html index 2259698c0f..bde9304597 100644 --- a/v0.4.0/_modules/doctr/datasets/utils.html +++ b/v0.4.0/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.utils

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import string
 import unicodedata
+from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Sequence as SequenceType
+
 import numpy as np
-from typing import List, Optional, Any
+from PIL import Image
+
+from doctr.io.image import get_img_shape
+from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
 
 from .vocabs import VOCABS
 
-__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
+__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
+
+ImageTensor = TypeVar("ImageTensor")
 
 
 def translate(
     input_string: str,
     vocab_name: str,
-    unknown_char: str = '■',
+    unknown_char: str = "■",
 ) -> str:
     """Translate a string input in a given vocabulary
 
     Args:
+    ----
         input_string: input string to translate
         vocab_name: vocabulary to use (french, latin, ...)
         unknown_char: unknown character for non-translatable characters
 
     Returns:
-        A string translated in a given vocab"""
-
+    -------
+        A string translated in a given vocab
+    """
     if VOCABS.get(vocab_name) is None:
         raise KeyError("output vocabulary must be in vocabs dictionary")
 
-    translated = ''
+    translated = ""
     for char in input_string:
         if char not in VOCABS[vocab_name]:
             # we need to translate char into a vocab char
@@ -315,51 +350,63 @@ 

Source code for doctr.datasets.utils

                 # remove whitespaces
                 continue
             # normalize character if it is not in vocab
-            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
-            if char == '' or char not in VOCABS[vocab_name]:
+            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
+            if char == "" or char not in VOCABS[vocab_name]:
                 # if normalization fails or char still not in vocab, return unknown character
                 char = unknown_char
         translated += char
     return translated
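A quick sketch of `translate` with the `french` vocab mentioned in the docstring; characters that cannot be normalized into the target vocab come back as the `unknown_char` replacement:

```python
from doctr.datasets.utils import translate

print(translate("Ünïcodé Test", "french"))
print(translate("日本語", "french", unknown_char="?"))
```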
 
 
-def encode_sequence(
+def encode_string(
     input_string: str,
     vocab: str,
 ) -> List[int]:
     """Given a predefined mapping, encode the string to a sequence of numbers
 
     Args:
+    ----
         input_string: string to encode
         vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A list encoding the input_string"""
-
-    return list(map(vocab.index, input_string))  # type: ignore[arg-type]
+    -------
+        A list encoding the input_string
+    """
+    try:
+        return list(map(vocab.index, input_string))
+    except ValueError:
+        raise ValueError(
+            f"some characters cannot be found in 'vocab'. \
+                         Please check the input string {input_string} and the vocabulary {vocab}"
+        )
 
 
 def decode_sequence(
-    input_array: np.array,
+    input_seq: Union[np.ndarray, SequenceType[int]],
     mapping: str,
 ) -> str:
     """Given a predefined mapping, decode the sequence of numbers to a string
 
     Args:
-        input_array: array to decode
+    ----
+        input_seq: array to decode
         mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A string, decoded from input_array"""
-
-    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
+    -------
+        A string, decoded from input_seq
+    """
+    if not isinstance(input_seq, (Sequence, np.ndarray)):
+        raise TypeError("Invalid sequence type")
+    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
         raise AssertionError("Input must be an array of int, with max less than mapping size")
-    decoded = ''.join(mapping[idx] for idx in input_array)
-    return decoded
+
+    return "".join(map(mapping.__getitem__, input_seq))
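`encode_sequence` is renamed to `encode_string` and now raises a `ValueError` with a helpful message on out-of-vocab characters, while `decode_sequence` accepts plain Python sequences as well as integer arrays. A round-trip sketch with a toy vocab:

```python
import numpy as np

from doctr.datasets.utils import decode_sequence, encode_string

vocab = "abcdefghijklmnopqrstuvwxyz"
indices = encode_string("hello", vocab)      # [7, 4, 11, 11, 14]
assert decode_sequence(indices, vocab) == "hello"
assert decode_sequence(np.array(indices, dtype=np.int_), vocab) == "hello"

# Characters missing from the vocab now raise a ValueError with an explicit message
try:
    encode_string("hello!", vocab)
except ValueError as e:
    print(e)
```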
 
 
 
-[docs] +[docs] def encode_sequences( sequences: List[str], vocab: str, @@ -367,48 +414,53 @@

Source code for doctr.datasets.utils

     eos: int = -1,
     sos: Optional[int] = None,
     pad: Optional[int] = None,
-    **kwargs: Any,
+    dynamic_seq_length: bool = False,
 ) -> np.ndarray:
     """Encode character sequences using a given vocab as mapping
 
     Args:
+    ----
         sequences: the list of character sequences of size N
         vocab: the ordered vocab to use for encoding
         target_size: maximum length of the encoded data
         eos: encoding of End Of String
         sos: optional encoding of Start Of String
         pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
+        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
 
     Returns:
+    -------
         the padded encoded data as a tensor
     """
-
     if 0 <= eos < len(vocab):
         raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
 
-    if not isinstance(target_size, int):
-        target_size = max(len(w) for w in sequences)
-        if sos:
-            target_size += 1
-        if pad:
-            target_size += 1
+    if not isinstance(target_size, int) or dynamic_seq_length:
+        # Maximum string length + EOS
+        max_length = max(len(w) for w in sequences) + 1
+        if isinstance(sos, int):
+            max_length += 1
+        if isinstance(pad, int):
+            max_length += 1
+        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
 
     # Pad all sequences
-    if pad:  # pad with padding symbol
+    if isinstance(pad, int):  # pad with padding symbol
         if 0 <= pad < len(vocab):
             raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
         # In that case, add EOS at the end of the word before padding
-        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
+        default_symbol = pad
     else:  # pad with eos symbol
-        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
+        default_symbol = eos
+    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
 
-    for idx, seq in enumerate(sequences):
-        encoded_seq = encode_sequence(seq, vocab)
-        if pad:  # add eos at the end of the sequence
-            encoded_seq.append(eos)
-        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
+    # Encode the strings
+    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
+        if isinstance(pad, int):  # add eos at the end of the sequence
+            seq.append(eos)
+        encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)]
 
-    if sos:  # place eos symbol at the beginning of each sequence
+    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
         if 0 <= sos < len(vocab):
             raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
         encoded_data = np.roll(encoded_data, 1)
@@ -416,6 +468,59 @@ 

Source code for doctr.datasets.utils

 
     return encoded_data
+ + +def convert_target_to_relative( + img: ImageTensor, target: Union[np.ndarray, Dict[str, Any]] +) -> Tuple[ImageTensor, Union[Dict[str, Any], np.ndarray]]: + if isinstance(target, np.ndarray): + target = convert_to_relative_coords(target, get_img_shape(img)) + else: + target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img)) + return img, target + + +def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]: + """Crop a set of bounding boxes from an image + + Args: + ---- + img_path: path to the image + geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4) + + Returns: + ------- + a list of cropped images + """ + with Image.open(img_path) as pil_img: + img: np.ndarray = np.array(pil_img.convert("RGB")) + # Polygon + if geoms.ndim == 3 and geoms.shape[1:] == (4, 2): + return extract_rcrops(img, geoms.astype(dtype=int)) + if geoms.ndim == 2 and geoms.shape[1] == 4: + return extract_crops(img, geoms.astype(dtype=int)) + raise ValueError("Invalid geometry format") + + +def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]: + """Converts multiclass target to relative coordinates. + + Args: + ---- + img: Image + target: tuple of target polygons and their classes names + + Returns: + ------- + Image and dictionary of boxes, with class names as keys + """ + boxes = convert_to_relative_coords(target[0], get_img_shape(img)) + boxes_classes = target[1] + boxes_dict: Dict = {k: [] for k in sorted(set(boxes_classes))} + for k, poly in zip(boxes_classes, boxes): + boxes_dict[k].append(poly) + boxes_dict = {k: np.stack(v, axis=0) for k, v in boxes_dict.items()} + return img, boxes_dict
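`encode_sequences` now handles `eos`/`sos`/`pad` explicitly and adds an optional `dynamic_seq_length` upper bound. A sketch using the `french` entry of `VOCABS` (assumed present, as in the `translate` docstring); note that `eos` and `pad` must be indices outside the vocab range:

```python
from doctr.datasets.utils import encode_sequences
from doctr.datasets.vocabs import VOCABS

vocab = VOCABS["french"]
encoded = encode_sequences(
    sequences=["hello", "world"],
    vocab=vocab,
    target_size=32,
    eos=len(vocab),       # end-of-string symbol, outside vocab indices
    pad=len(vocab) + 1,   # padding symbol, also outside vocab indices
)
print(encoded.shape, encoded.dtype)  # (2, 32) int32
```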
@@ -448,8 +553,8 @@

Source code for doctr.datasets.utils

       
     
   
- - + + diff --git a/v0.4.0/_modules/doctr/datasets/wildreceipt.html b/v0.4.0/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.4.0/_modules/doctr/datasets/wildreceipt.html +++ b/v0.4.0/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.4.0/_modules/doctr/documents/elements.html b/v0.4.0/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.4.0/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
-
- -
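The removed `doctr.documents.elements` page above duplicates what newer builds document under `doctr.io.elements`. Against the legacy API shown here, a document tree could be assembled by hand roughly as follows (relative `((xmin, ymin), (xmax, ymax))` geometries, made-up values):

```python
from doctr.documents.elements import Word, Line, Block, Page, Document  # legacy module path

words = [
    Word("Hello", 0.99, ((0.10, 0.10), (0.25, 0.15))),
    Word("world", 0.98, ((0.27, 0.10), (0.40, 0.15))),
]
line = Line(words)                     # enclosing geometry resolved automatically
block = Block(lines=[line])
page = Page(blocks=[block], page_idx=0, dimensions=(595, 842))
doc = Document(pages=[page])

print(doc.render())  # "Hello world"
print(page.export()["blocks"][0]["lines"][0]["words"][0]["value"])  # "Hello"
```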
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/_modules/doctr/documents/reader.html b/v0.4.0/_modules/doctr/documents/reader.html deleted file mode 100644 index cdcd814b6c..0000000000 --- a/v0.4.0/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
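Putting the PDF helpers above together, each method returns one entry per page; a minimal, illustrative sketch with a placeholder path (not executed here):

    >>> from doctr.documents import DocumentFile
    >>> pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
    >>> pages = pdf_doc.as_images(default_scales=(2, 2))  # kwargs forwarded to convert_page_to_numpy
    >>> words = pdf_doc.get_words()                       # one list of (bbox, value) per page
    >>> artefacts = pdf_doc.get_artefacts()               # one list of image bounding boxes per page
    >>> len(pages) == len(words) == len(artefacts)
    True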
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
-
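from_images also accepts in-memory byte streams, and a single item is wrapped into a list; a small illustrative sketch with a placeholder path:

    >>> from doctr.documents import DocumentFile
    >>> stream = open("path/to/your/page1.png", "rb").read()
    >>> pages = DocumentFile.from_images(stream)
    >>> len(pages)
    1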
\ No newline at end of file diff --git a/v0.4.0/_modules/doctr/io/elements.html b/v0.4.0/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.4.0/_modules/doctr/io/elements.html +++ b/v0.4.0/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.4.0/_modules/doctr/io/html.html b/v0.4.0/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.4.0/_modules/doctr/io/html.html +++ b/v0.4.0/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.4.0/_modules/doctr/io/image/base.html b/v0.4.0/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.4.0/_modules/doctr/io/image/base.html +++ b/v0.4.0/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.4.0/_modules/doctr/io/image/tensorflow.html b/v0.4.0/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.4.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.4.0/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.4.0/_modules/doctr/io/pdf.html b/v0.4.0/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.4.0/_modules/doctr/io/pdf.html +++ b/v0.4.0/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.4.0/_modules/doctr/io/reader.html b/v0.4.0/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.4.0/_modules/doctr/io/reader.html +++ b/v0.4.0/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.4.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.4.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.4.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.4.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.4.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.4.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.4.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.4.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.4.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.4.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.4.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.4.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.4.0/_modules/doctr/models/classification/zoo.html b/v0.4.0/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.4.0/_modules/doctr/models/classification/zoo.html +++ b/v0.4.0/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

- + diff --git a/v0.4.0/_modules/doctr/models/detection/differentiable_binarization.html b/v0.4.0/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index 38e9b36ec2..0000000000 --- a/v0.4.0/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@ - doctr.models.detection.differentiable_binarization - docTR documentation

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to unshrink polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the p_map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: The first parameter.
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly cast to an ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
-
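To make the unclip_ratio expansion in polygon_to_box concrete, the offset distance is area * unclip_ratio / perimeter; a quick numeric check with shapely, using a made-up 100 x 20 px box:

    >>> import numpy as np
    >>> from shapely.geometry import Polygon
    >>> pts = np.array([[0, 0], [100, 0], [100, 20], [0, 20]])
    >>> poly = Polygon(pts)
    >>> poly.area * 1.5 / poly.length  # expansion distance for unclip_ratio=1.5
    12.5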
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing x, y, w, h, score for the box
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
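The boxes returned by bitmap_to_boxes are relative to the map size (plus a score); a brief sketch, with made-up values, of mapping them back to absolute pixel coordinates:

    >>> import numpy as np
    >>> boxes = np.array([[0.1, 0.2, 0.5, 0.4, 0.9]])  # xmin, ymin, xmax, ymax, score
    >>> h, w = 1024, 1024
    >>> (boxes[:, :4] * [w, h, w, h]).round().astype(int)
    array([[102, 205, 512, 410]])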
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channel to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):  # top-down pathway: add the upsampled coarser map
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature maps is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
-
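A rough numeric check of compute_distance (assuming the DBNet class above is in scope): for a grid point above the middle of a horizontal segment it returns the perpendicular distance to the supporting line, while for points subtending an acute angle over the segment (i.e. relatively far from it) it falls back to the distance to the nearest endpoint.

    >>> import numpy as np
    >>> xs, ys = np.meshgrid(np.arange(5, dtype=float), np.arange(3, dtype=float))
    >>> d = DBNet.compute_distance(xs, ys, np.array([0., 0.]), np.array([4., 0.]))
    >>> round(float(d[1, 2]), 3)  # point (x=2, y=1) -> perpendicular distance to the segment
    1.0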
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon treshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coord., to draw the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
-
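For intuition, a self-contained sketch (with made-up loss values, not the actual training code) of the hard-negative selection used in compute_loss above: at most 3 x positive_count negatives are kept, taking the highest BCE values via tf.nn.top_k:

    >>> import tensorflow as tf
    >>> bce = tf.constant([0.2, 0.9, 0.1, 0.7, 0.05, 0.4])  # per-pixel BCE, first entry is the only positive
    >>> is_pos = tf.constant([1., 0., 0., 0., 0., 0.])
    >>> neg_losses = bce * (1. - is_pos)
    >>> k = tf.cast(tf.minimum(tf.reduce_sum(1. - is_pos), 3. * tf.reduce_sum(is_pos)), tf.int32)
    >>> hard_negs, _ = tf.nn.top_k(neg_losses, k)
    >>> hard_negs.numpy()
    array([0.9, 0.7, 0.4], dtype=float32)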
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
\ No newline at end of file diff --git a/v0.4.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.4.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 9145c7c3fd..66cef8663d 100644 --- a/v0.4.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import List, Tuple, Optional, Any, Dict
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
+
+from ...classification import mobilenet_v3_large
 from .base import DBPostProcessor, _DBNet
 
-__all__ = ['DBNet', 'db_resnet50']
+__all__ = ["DBNet", "db_resnet50", "db_mobilenet_v3_large"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
+    "db_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_resnet50-649fa22b.weights.h5&src=0",
+    },
+    "db_mobilenet_v3_large": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_mobilenet_v3_large-ee2e1dbe.weights.h5&src=0",
     },
 }
 
@@ -313,6 +348,7 @@ 

Source code for doctr.models.detection.differentiable_binarization.tensorflo <https://arxiv.org/pdf/1612.03144.pdf>`_. Args: + ---- channels: number of channel to output """ @@ -322,9 +358,9 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo ) -> None: super().__init__() self.channels = channels - self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest') - self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)] - self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)] + self.upsample = layers.UpSampling2D(size=(2, 2), interpolation="nearest") + self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer="he_normal") for _ in range(4)] + self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2**idx) for idx in range(4)] @staticmethod def build_upsampling( @@ -334,20 +370,21 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo """Module which performs a 3x3 convolution followed by up-sampling Args: + ---- channels: number of output channels dilation_factor (int): dilation factor to scale the convolution output before concatenation Returns: + ------- a keras.layers.Layer object, wrapping these operations in a sequential module """ - - _layers = conv_sequence(channels, 'relu', True, kernel_size=3) + _layers = conv_sequence(channels, "relu", True, kernel_size=3) if dilation_factor > 1: - _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest')) + _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation="nearest")) - module = keras.Sequential(_layers) + module = Sequential(_layers) return module @@ -359,7 +396,6 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo x: List[tf.Tensor], **kwargs: Any, ) -> tf.Tensor: - # Channel mapping results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)] # Upsample & sum @@ -371,200 +407,324 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo return layers.concatenate(results) -class DBNet(_DBNet, keras.Model, NestedObject): +class DBNet(_DBNet, Model, NestedObject): """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_. Args: + ---- feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to + bin_thresh: threshold for binarization + box_thresh: minimal objectness score to consider a box + assume_straight_pages: if True, fit straight bounding boxes only + exportable: onnx exportable returns only logits + cfg: the configuration dict of the model + class_names: list of class names """ - _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "fpn", "probability_head", "threshold_head", "postprocessor"] def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, - rotated_bbox: bool = False, + fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea + bin_thresh: float = 0.3, + box_thresh: float = 0.1, + assume_straight_pages: bool = True, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, + class_names: List[str] = [CLASS_NAME], ) -> None: - super().__init__() + self.class_names = class_names + num_classes: int = len(self.class_names) self.cfg = cfg self.feat_extractor = feature_extractor - self.rotated_bbox = rotated_bbox + self.exportable = exportable + self.assume_straight_pages = assume_straight_pages self.fpn = FeaturePyramidNetwork(channels=fpn_channels) # Initialize kernels _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape] output_shape = tuple(self.fpn(_inputs).shape) - self.probability_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] + self.probability_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + self.threshold_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + + self.postprocessor = DBPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh ) - self.threshold_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] - ) - - self.postprocessor = 
DBPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, thresh_map: tf.Tensor, - target: List[Dict[str, Any]] + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output Args: + ---- out_map: output feature map of the model of shape (N, H, W, C) thresh_map: threshold map of shape (N, H, W, C) target: list of dictionary where each dict has a `boxes` and a `flags` entry + gamma: modulating factor in the focal loss formula + alpha: balancing factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") - prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1])) - thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) + prob_map = tf.math.sigmoid(out_map) + thresh_map = tf.math.sigmoid(thresh_map) - seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + seg_target, seg_mask, thresh_target, thresh_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) + seg_mask = tf.cast(seg_mask, tf.float32) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) - # Compute balanced BCE loss for proba_map - bce_scale = 5. - bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask] - - neg_target = 1 - seg_target[seg_mask] - positive_count = tf.math.reduce_sum(seg_target[seg_mask]) - negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count]) - negative_loss = bce_loss * neg_target - negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32)) - sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss) - balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6) - - # Compute dice loss for approxbin_map - bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask]))) - - bce_min = tf.math.reduce_min(bce_loss) - weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1. 
- inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights) - union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8 - dice_loss = 1 - 2.0 * inter / union + # Focal loss + focal_scale = 10.0 + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + + # Convert logits to prob, compute gamma factor + p_t = (seg_target * prob_map) + ((1 - seg_target) * (1 - prob_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class or for approx binary_map + if len(self.class_names) > 1: + dice_map = tf.nn.softmax(out_map, axis=-1) + else: + # compute binary map instead + dice_map = 1.0 / (1.0 + tf.exp(-50 * (prob_map - thresh_map))) + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) # Compute l1 loss for thresh_map - l1_scale = 10. if tf.reduce_any(thresh_mask): - l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask])) + thresh_mask = tf.cast(thresh_mask, tf.float32) + l1_loss = tf.reduce_sum(tf.abs(thresh_map - thresh_target) * thresh_mask) / ( + tf.reduce_sum(thresh_mask) + eps + ) else: - l1_loss = tf.constant(0.) + l1_loss = tf.constant(0.0) - return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss + return l1_loss + focal_scale * focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) feat_concat = self.fpn(feat_maps, **kwargs) logits = self.probability_head(feat_concat, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: - # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + if target is None or return_preds: + # Post-process boxes (keep only text predictions) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: thresh_map = self.threshold_head(feat_concat, **kwargs) loss = self.compute_loss(logits, thresh_map, target) - out['loss'] = loss + out["loss"] = loss return out -def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: +def _db_resnet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['fpn_channels'] = 
kwargs.get('fpn_channels', _cfg['fpn_channels']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) # Feature extractor - resnet = tf.keras.applications.__dict__[_cfg['backbone']]( - include_top=False, - weights=None, - input_shape=_cfg['input_shape'], - pooling=None, + feat_extractor = IntermediateLayerGetter( + backbone_fn( + weights="imagenet" if pretrained_backbone else None, + include_top=False, + pooling=None, + input_shape=_cfg["input_shape"], + ), + fpn_layers, ) + # Build the model + model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + + # Load pretrained parameters + if pretrained: + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) + + return model + + +def _db_mobilenet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained + + # Patch the config + _cfg = deepcopy(default_cfgs[arch]) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = default_cfgs[arch].get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor feat_extractor = IntermediateLayerGetter( - resnet, - _cfg['fpn_layers'], + backbone_fn( + input_shape=_cfg["input_shape"], + include_top=False, + pretrained=pretrained_backbone, + ), + fpn_layers, ) - kwargs['fpn_channels'] = _cfg['fpn_channels'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] - # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model
-[docs] +[docs] def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture Returns: + ------- text detection architecture """ + return _db_resnet( + "db_resnet50", + pretrained, + ResNet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
+ + + +
+[docs] +def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: + """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" + <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. + + >>> import tensorflow as tf + >>> from doctr.models import db_mobilenet_v3_large + >>> model = db_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) - return _db_resnet('db_resnet50', pretrained, **kwargs)
+ Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture + + Returns: + ------- + text detection architecture + """ + return _db_mobilenet( + "db_mobilenet_v3_large", + pretrained, + mobilenet_v3_large, + ["inverted_2", "inverted_5", "inverted_11", "final_block"], + **kwargs, + )

@@ -598,8 +758,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo - - + + diff --git a/v0.4.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.4.0/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.4.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.4.0/_modules/doctr/models/detection/linknet.html b/v0.4.0/_modules/doctr/models/detection/linknet.html deleted file mode 100644 index 129cfdce8b..0000000000 --- a/v0.4.0/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@ - doctr.models.detection.linknet - docTR documentation

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the p_map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from the LinkNet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing x, y, w, h, score for the box
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
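A tiny sketch of the connected-components step that LinkNetPostProcessor.bitmap_to_boxes relies on, using a synthetic binary map (made-up values):

    >>> import numpy as np
    >>> import cv2
    >>> bitmap = np.zeros((8, 8), np.uint8)
    >>> bitmap[2:5, 1:7] = 1  # one text blob
    >>> label_num, labelimage = cv2.connectedComponents(bitmap, connectivity=4)
    >>> label_num  # background + one component
    2
    >>> np.array(np.where(labelimage == 1)[::-1]).T.shape  # (num_pixels, 2) points for that label
    (18, 2)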
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module"""
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
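For intuition, a standalone sketch (illustrative assumptions only, not part of the source) of what the target construction above produces for one image containing a single unambiguous box given in relative coordinates:

import numpy as np

output_shape = (1, 16, 16)                      # (batch, H, W), toy size
seg_target = np.zeros(output_shape, dtype=bool)
seg_mask = np.ones(output_shape, dtype=bool)

rel_box = np.array([0.25, 0.25, 0.75, 0.75])    # (xmin, ymin, xmax, ymax), relative to the map size
x0, y0, x1, y1 = (rel_box * np.array([16, 16, 16, 16])).round().astype(np.int32)

seg_target[0, y0: y1 + 1, x0: x1 + 1] = True    # pixels inside the box become positives
# an ambiguous or too-small box would instead set seg_mask to False over the same slice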
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
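Because the forward pass returns a dictionary, training amounts to reading out['loss'] under a gradient tape. A hedged sketch of one training step, assuming the model behaves as a regular Keras model (shapes and the toy target are illustrative, not taken from the source):

import numpy as np
import tensorflow as tf

model = LinkNet(out_chan=1, input_shape=(512, 512, 3))
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

images = tf.random.uniform((2, 512, 512, 3), maxval=1, dtype=tf.float32)
targets = [
    {"boxes": np.array([[0.1, 0.1, 0.6, 0.4]], dtype=np.float32), "flags": np.array([False])}
    for _ in range(2)
]

with tf.GradientTape() as tape:
    out = model(images, target=targets, training=True)
    loss = out["loss"]
grads = tape.gradient(loss, model.trainable_weights)
optimizer.apply_gradients(zip(grads, model.trainable_weights))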
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.4.0/_modules/doctr/models/detection/linknet/tensorflow.html index cd4f446673..ce995f99d4 100644 --- a/v0.4.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.linknet.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.classification import resnet18, resnet34, resnet50
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.backbones import ResnetStage
-from doctr.models.utils import conv_sequence, load_pretrained_params
-from .base import LinkNetPostProcessor, _LinkNet
 
-__all__ = ['LinkNet', 'linknet16']
+from .base import LinkNetPostProcessor, _LinkNet
 
+__all__ = ["LinkNet", "linknet_resnet18", "linknet_resnet34", "linknet_resnet50"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet16': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'num_classes': 1,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': None,
+    "linknet_resnet18": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet18-615a82c5.weights.h5&src=0",
+    },
+    "linknet_resnet34": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet34-9d772be5.weights.h5&src=0",
+    },
+    "linknet_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet50-6bf6c8b5.weights.h5&src=0",
     },
 }
 
 
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
+def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential:
     """Creates a LinkNet decoder block"""
-
     return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
+        *conv_sequence(in_chan // 4, "relu", True, kernel_size=1, **kwargs),
         layers.Conv2DTranspose(
             filters=in_chan // 4,
             kernel_size=3,
-            strides=2,
+            strides=stride,
             padding="same",
             use_bias=False,
-            kernel_initializer='he_normal'
+            kernel_initializer="he_normal",
         ),
         layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
+        layers.Activation("relu"),
+        *conv_sequence(out_chan, "relu", True, kernel_size=1),
     ])
 
 
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module"""
+class LinkNetFPN(Model, NestedObject):
+    """LinkNet Decoder module"""
 
     def __init__(
         self,
+        out_chans: int,
+        in_shapes: List[Tuple[int, ...]],
     ) -> None:
-
         super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
+        self.out_chans = out_chans
+        strides = [2] * (len(in_shapes) - 1) + [1]
+        i_chans = [s[-1] for s in in_shapes[::-1]]
+        o_chans = i_chans[1:] + [out_chans]
+        self.decoders = [
+            decoder_block(in_chan, out_chan, s, input_shape=in_shape)
+            for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1])
+        ]
+
+    def call(self, x: List[tf.Tensor], **kwargs: Any) -> tf.Tensor:
+        out = 0
+        for decoder, fmap in zip(self.decoders, x[::-1]):
+            out = decoder(out + fmap, **kwargs)
+        return out
 
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(_LinkNet, keras.Model):
+    def extra_repr(self) -> str:
+        return f"out_chans={self.out_chans}"
+
+
+class LinkNet(_LinkNet, Model):
     """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
     <https://arxiv.org/pdf/1707.03718.pdf>`_.
 
     Args:
-        num_classes: number of channels for the output
+    ----
+        feature extractor: the backbone serving as feature extractor
+        fpn_channels: number of channels each extracted feature maps is mapped to
+        bin_thresh: threshold for binarization of the output feature map
+        box_thresh: minimal objectness score to consider a box
+        assume_straight_pages: if True, fit straight bounding boxes only
+        exportable: onnx exportable returns only logits
+        cfg: the configuration dict of the model
+        class_names: list of class names
     """
 
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
+    _children_names: List[str] = ["feat_extractor", "fpn", "classifier", "postprocessor"]
 
     def __init__(
         self,
-        num_classes: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        rotated_bbox: bool = False,
+        feat_extractor: IntermediateLayerGetter,
+        fpn_channels: int = 64,
+        bin_thresh: float = 0.1,
+        box_thresh: float = 0.1,
+        assume_straight_pages: bool = True,
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
+        class_names: List[str] = [CLASS_NAME],
     ) -> None:
         super().__init__(cfg=cfg)
 
-        self.rotated_bbox = rotated_bbox
+        self.class_names = class_names
+        num_classes: int = len(self.class_names)
 
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
+        self.exportable = exportable
+        self.assume_straight_pages = assume_straight_pages
+
+        self.feat_extractor = feat_extractor
 
-        self.fpn = LinkNetFPN()
+        self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape])
+        self.fpn.build(self.feat_extractor.output_shape)
 
         self.classifier = Sequential([
             layers.Conv2DTranspose(
@@ -393,154 +442,246 @@ 

Source code for doctr.models.detection.linknet.tensorflow

strides=2, padding="same", use_bias=False, - kernel_initializer='he_normal' + kernel_initializer="he_normal", + input_shape=self.fpn.decoders[-1].output_shape[1:], ), layers.BatchNormalization(), - layers.Activation('relu'), - *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), + layers.Activation("relu"), + *conv_sequence(32, "relu", True, kernel_size=3, strides=1), layers.Conv2DTranspose( filters=num_classes, kernel_size=2, strides=2, padding="same", - use_bias=False, - kernel_initializer='he_normal' + use_bias=True, + kernel_initializer="he_normal", ), ]) - self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) + self.postprocessor = LinkNetPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh + ) def compute_loss( self, out_map: tf.Tensor, - target: List[Dict[str, Any]], - focal_loss: bool = False, - alpha: float = .5, - gamma: float = 2., - edge_factor: float = 2., + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on <https://github.com/tensorflow/addons/>`_. Args: + ---- out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry - focal_loss: if True, use focal loss instead of BCE - edge_factor: boost factor for box edges (in case of BCE) + gamma: modulating factor in the focal loss formula alpha: balancing factor in the focal loss formula - gammma: modulating factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ - seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) - edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) + seg_target, seg_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - - # Get the cross_entropy for each entry - bce = tf.keras.losses.binary_crossentropy( - seg_target[seg_mask], - tf.squeeze(out_map, axis=[-1])[seg_mask], - from_logits=True) - - if focal_loss: - if gamma and gamma < 0: - raise ValueError("Value of gamma should be greater than or equal to zero.") - - # Convert logits to prob, compute gamma factor - pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) - p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) - modulating_factor = tf.pow((1.0 - p_t), gamma) - - # Compute alpha factor - alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) - - # compute the final loss - loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) - - else: - # Compute BCE loss with highlighted edges - loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), - bce - ) - loss = tf.reduce_mean(loss) - - return loss + seg_mask = tf.cast(seg_mask, tf.float32) + + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + proba_map = tf.sigmoid(out_map) + + # Focal loss + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + # Convert logits to prob, compute gamma factor + p_t = (seg_target * proba_map) + ((1 - seg_target) * (1 - proba_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * 
(1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class + dice_map = tf.nn.softmax(out_map, axis=-1) if len(self.class_names) > 1 else proba_map + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) + + return focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, - focal_loss: bool = True, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - - logits = self.stem(x) - logits = self.fpn(logits) - logits = self.classifier(logits) + feat_maps = self.feat_extractor(x, **kwargs) + logits = self.fpn(feat_maps, **kwargs) + logits = self.classifier(logits, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) + if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: + if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: - loss = self.compute_loss(logits, target, focal_loss) - out['loss'] = loss + loss = self.compute_loss(logits, target) + out["loss"] = loss return out -def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: +def _linknet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> LinkNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor + feat_extractor = IntermediateLayerGetter( + backbone_fn( + pretrained=pretrained_backbone, + include_top=False, + input_shape=_cfg["input_shape"], + ), + fpn_layers, + ) - kwargs['num_classes'] = _cfg['num_classes'] - kwargs['input_shape'] = _cfg['input_shape'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(cfg=_cfg, **kwargs) + model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + 
skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model -
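The revised compute_loss above combines a focal term with a dice term instead of the earlier BCE variants. A self-contained re-statement of that combination for the single-class case (a sketch with assumed N x H x W map shapes, not the library's exact code):

import tensorflow as tf

def focal_dice_loss(logits, seg_target, seg_mask, gamma=2.0, alpha=0.5, eps=1e-8):
    # logits, seg_target: float tensors of shape (N, H, W); seg_mask: bool tensor of the same shape
    proba = tf.sigmoid(logits)
    mask = tf.cast(seg_mask, tf.float32)

    # per-pixel BCE weighted by the focal modulation (1 - p_t) ** gamma and the alpha balancing term
    bce = tf.keras.losses.binary_crossentropy(seg_target[..., None], logits[..., None], from_logits=True)
    p_t = seg_target * proba + (1 - seg_target) * (1 - proba)
    alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha)
    focal = tf.reduce_sum(mask * alpha_t * (1 - p_t) ** gamma * bce) / tf.reduce_sum(mask)

    # dice term computed on the masked probability map
    inter = tf.reduce_sum(mask * proba * seg_target)
    cardinality = tf.reduce_sum(mask * (proba + seg_target))
    dice = 1 - 2 * inter / (cardinality + eps)

    return focal + dice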
-[docs] -def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
+[docs] +def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet18 + >>> model = linknet_resnet18(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture + + Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet18", + pretrained, + resnet18, + ["resnet_block_1", "resnet_block_3", "resnet_block_5", "resnet_block_7"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet34(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet16 - >>> model = linknet16(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet34 + >>> model = linknet_resnet34(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture Returns: + ------- text detection architecture """ + return _linknet( + "linknet_resnet34", + pretrained, + resnet34, + ["resnet_block_2", "resnet_block_6", "resnet_block_12", "resnet_block_15"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet50(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet50 + >>> model = linknet_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture - return _linknet('linknet16', pretrained, **kwargs)
+ Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet50", + pretrained, + resnet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
@@ -574,8 +715,8 @@

Source code for doctr.models.detection.linknet.tensorflow

- +
+ diff --git a/v0.4.0/_modules/doctr/models/detection/zoo.html b/v0.4.0/_modules/doctr/models/detection/zoo.html index d3128b8d14..3651c4e2d3 100644 --- a/v0.4.0/_modules/doctr/models/detection/zoo.html +++ b/v0.4.0/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
 from doctr.file_utils import is_tf_available, is_torch_available
-from .core import DetectionPredictor
-from ..preprocessor import PreProcessor
-from .. import detection
 
+from .. import detection
+from ..detection.fast import reparameterize
+from ..preprocessor import PreProcessor
+from .predictor import DetectionPredictor
 
 __all__ = ["detection_predictor"]
 
+ARCHS: List[str]
+
 
 if is_tf_available():
-    ARCHS = ['db_resnet50', 'linknet16']
+    ARCHS = [
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
 elif is_torch_available():
-    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
+    ARCHS = [
+        "db_resnet34",
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
+
 
+def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
+        _model = detection.__dict__[arch](
+            pretrained=pretrained,
+            pretrained_backbone=kwargs.get("pretrained_backbone", True),
+            assume_straight_pages=assume_straight_pages,
+        )
+        # Reparameterize FAST models by default to lower inference latency and memory usage
+        if isinstance(_model, detection.FAST):
+            _model = reparameterize(_model)
+    else:
+        if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
+            raise ValueError(f"unknown architecture: {type(arch)}")
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+        _model = arch
+        _model.assume_straight_pages = assume_straight_pages
+        _model.postprocessor.assume_straight_pages = assume_straight_pages
 
-    # Detection
-    _model = detection.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 1)
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 2)
     predictor = DetectionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
-        _model
+        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
+        _model,
     )
     return predictor
 
 
 
-[docs] -def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: +[docs] +def detection_predictor( + arch: Any = "fast_base", + pretrained: bool = False, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + batch_size: int = 2, + **kwargs: Any, +) -> DetectionPredictor: """Text detection architecture. - Example:: - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_resnet50') + ---- + arch: name of the architecture or model itself to use (e.g. 'db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset + assume_straight_pages: If True, fit straight boxes to the page + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right + batch_size: number of samples the model processes in parallel + **kwargs: optional keyword arguments passed to the architecture Returns: + ------- Detection predictor """ - - return _predictor(arch, pretrained, **kwargs)
+ return _predictor( + arch=arch, + pretrained=pretrained, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + batch_size=batch_size, + **kwargs, + )
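One change worth noting in the hunk above: arch may now be a model instance rather than an architecture name. A hedged usage sketch of that path (import paths assumed from the library, not shown in this diff):

import numpy as np
from doctr.models import detection_predictor
from doctr.models.detection import db_resnet50

# pass an instantiated DBNet / LinkNet / FAST model instead of a string
model = detection_predictor(arch=db_resnet50(pretrained=True), assume_straight_pages=True, batch_size=2)
input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
out = model([input_page])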
@@ -367,8 +449,8 @@

Source code for doctr.models.detection.zoo

       
     
   
- - + + diff --git a/v0.4.0/_modules/doctr/models/export.html b/v0.4.0/_modules/doctr/models/export.html deleted file mode 100644 index f25a81aa21..0000000000 --- a/v0.4.0/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@ - - - - - - - - - - - - doctr.models.export - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
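The three converters above all return a serialized TFLite flatbuffer. As a hedged follow-up sketch (not part of this module), such bytes could be executed with the TFLite interpreter; the dummy input is illustrative:

import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from doctr.models import convert_to_tflite, conv_sequence

model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
serialized_model = convert_to_tflite(model)

interpreter = tf.lite.Interpreter(model_content=serialized_model)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

dummy = np.random.rand(*input_details[0]["shape"]).astype(input_details[0]["dtype"])
interpreter.set_tensor(input_details[0]["index"], dummy)
interpreter.invoke()
out = interpreter.get_tensor(output_details[0]["index"])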
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/_modules/doctr/models/factory/hub.html b/v0.4.0/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.4.0/_modules/doctr/models/factory/hub.html +++ b/v0.4.0/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.4.0/_modules/doctr/models/recognition/crnn.html b/v0.4.0/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.4.0/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.crnn - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs CTC decoding of the raw model output, then maps the predicted indices
-        to characters with the label_to_idx dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
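For intuition only, the collapse rule applied by CTC decoding (merge consecutive repeats, then drop blanks) can be stated in a few lines; the class above delegates the actual search to tf.nn.ctc_beam_search_decoder, so this is just an illustrative sketch:

import itertools

def greedy_ctc_collapse(indices, blank):
    # merge consecutive repeats, then remove the blank symbol
    merged = [k for k, _ in itertools.groupby(indices)]
    return [k for k in merged if k != blank]

# e.g. with blank index 3: [0, 0, 3, 1, 1, 3, 3, 2] -> [0, 1, 2]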
-
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            model_output: predicted logits of the model
-            target: list of ground-truth words, encoded internally together with their sequence lengths
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
-
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/crnn/tensorflow.html index 41cc93dd23..bc64da9a1b 100644 --- a/v0.4.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.crnn.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import tensorflow as tf
 from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential, Model
-from typing import Tuple, Dict, Any, Optional, List
+from tensorflow.keras.models import Model, Sequential
+
+from doctr.datasets import VOCABS
 
-from ... import backbones
-from ...utils import load_pretrained_params
+from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
+__all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
+    "crnn_vgg16_bn": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["legacy_french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_vgg16_bn-9c188f45.weights.h5&src=0",
     },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
+    "crnn_mobilenet_v3_small": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_small-54850265.weights.h5&src=0",
+    },
+    "crnn_mobilenet_v3_large": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_large-c64045e5.weights.h5&src=0",
     },
 }
 
 
 class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
+    """Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
         ignore_case: if True, ignore case of letters
         ignore_accents: if True, ignore accents of letters
@@ -325,37 +353,57 @@ 

Source code for doctr.models.recognition.crnn.tensorflow

def __call__( self, - logits: tf.Tensor - ) -> List[Tuple[str, float]]: - """ - Performs decoding of raw output with CTC and decoding of CTC predictions + logits: tf.Tensor, + beam_width: int = 1, + top_paths: int = 1, + ) -> Union[List[Tuple[str, float]], List[Tuple[List[str], List[float]]]]: + """Performs decoding of raw output with CTC and decoding of CTC predictions with label_to_idx mapping dictionnary Args: + ---- logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1 + beam_width: An int scalar >= 0 (beam search beam width). + top_paths: An int scalar >= 0, <= beam_width (controls output size). Returns: + ------- A list of decoded words of length BATCH_SIZE + """ # Decode CTC _decoded, _log_prob = tf.nn.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), - tf.fill(logits.shape[0], logits.shape[1]), - beam_width=1, top_paths=1, + tf.fill(tf.shape(logits)[:1], tf.shape(logits)[1]), + beam_width=beam_width, + top_paths=top_paths, ) - out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab)) - probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) + + _decoded = tf.sparse.concat( + 1, + [tf.sparse.expand_dims(dec, axis=1) for dec in _decoded], + expand_nonconcat_dims=True, + ) # dim : batchsize x beamwidth x actual_max_len_predictions + out_idxs = tf.sparse.to_dense(_decoded, default_value=len(self.vocab)) # Map it to characters _decoded_strings_pred = tf.strings.reduce_join( inputs=tf.nn.embedding_lookup(tf.constant(self._embedding, dtype=tf.string), out_idxs), - axis=-1 + axis=-1, ) _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] - word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - + decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value="not valid")[ + :, :, 0 + ] # dim : batch_size x beam_width + + if top_paths == 1: + probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) # dim : batchsize + decoded_strings_pred = tf.squeeze(decoded_strings_pred, axis=1) + word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] + else: + probs = tf.math.exp(_log_prob) # dim : batchsize x beamwidth + word_values = [[word.decode() for word in words] for words in decoded_strings_pred.numpy().tolist()] return list(zip(word_values, probs.numpy().tolist())) @@ -364,19 +412,26 @@

Source code for doctr.models.recognition.crnn.tensorflow

Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of units in the LSTM layers + exportable: onnx exportable returns only logits + beam_width: beam width for beam search decoding + top_paths: number of top paths for beam search decoding cfg: configuration dictionary """ - _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "decoder", "postprocessor"] def __init__( self, - feature_extractor: tf.keras.Model, + feature_extractor: Model, vocab: str, rnn_units: int = 128, + exportable: bool = False, + beam_width: int = 1, + top_paths: int = 1, cfg: Optional[Dict[str, Any]] = None, ) -> None: # Initialize kernels @@ -386,19 +441,21 @@

Source code for doctr.models.recognition.crnn.tensorflow

self.vocab = vocab self.max_length = w self.cfg = cfg + self.exportable = exportable self.feat_extractor = feature_extractor - self.decoder = Sequential( - [ - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Dense(units=len(vocab) + 1) - ] - ) + self.decoder = Sequential([ + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Dense(units=len(vocab) + 1), + ]) self.decoder.build(input_shape=(None, w, h * c)) self.postprocessor = CTCPostProcessor(vocab=vocab) + self.beam_width = beam_width + self.top_paths = top_paths + def compute_loss( self, model_output: tf.Tensor, @@ -407,16 +464,17 @@

Source code for doctr.models.recognition.crnn.tensorflow

"""Compute CTC loss for the model. Args: - gt: the encoded tensor with gt labels + ---- model_output: predicted logits of the model - seq_len: lengths of each gt word inside the batch + target: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) batch_len = model_output.shape[0] - input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) + input_length = tf.fill((batch_len,), model_output.shape[1]) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -428,8 +486,12 @@

Source code for doctr.models.recognition.crnn.tensorflow

target: Optional[List[str]] = None, return_model_output: bool = False, return_preds: bool = False, + beam_width: int = 1, + top_paths: int = 1, **kwargs: Any, ) -> Dict[str, Any]: + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") features = self.feat_extractor(x, **kwargs) # B x H x W x C --> B x W x H x C @@ -437,91 +499,132 @@

Source code for doctr.models.recognition.crnn.tensorflow

w, h, c = transposed_feat.get_shape().as_list()[1:] # B x W x H x C --> B x W x H * C features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c)) - logits = self.decoder(features_seq, **kwargs) + logits = _bf16_to_float32(self.decoder(features_seq, **kwargs)) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = logits + return out + if return_model_output: out["out_map"] = logits if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(logits) + out["preds"] = self.postprocessor(logits, beam_width=beam_width, top_paths=top_paths) if target is not None: - out['loss'] = self.compute_loss(logits, target) + out["loss"] = self.compute_loss(logits, target) return out -def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: +def _crnn( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> CRNN: + pretrained_backbone = pretrained_backbone and not pretrained + + kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"]) - # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) + _cfg["vocab"] = kwargs["vocab"] + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] - # Feature extractor - feat_extractor = backbones.__dict__[_cfg['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + input_shape=_cfg["input_shape"], include_top=False, + pretrained=pretrained_backbone, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params(model, _cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]) return model
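A hedged usage sketch of the beam-search options exposed by the revised call and post-processor above; whether extra keyword arguments are forwarded exactly like this depends on the Keras call path, so treat it as an assumption rather than documented behaviour:

import tensorflow as tf
from doctr.models import crnn_vgg16_bn

model = crnn_vgg16_bn(pretrained=True)
input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)

# with top_paths > 1, each sample yields a list of candidate words with their scores
out = model(input_tensor, beam_width=4, top_paths=3)
candidates = out["preds"]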
-[docs] +[docs] def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_vgg16_bn + >>> model = crnn_vgg16_bn(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_vgg16_bn", pretrained, vgg16_bn_r, **kwargs)
+ + + +
+[docs] +def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based + Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_small + >>> model = crnn_mobilenet_v3_small(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
+ Returns: + ------- + text recognition architecture + """ + return _crnn("crnn_mobilenet_v3_small", pretrained, mobilenet_v3_small_r, **kwargs)
-def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based +
+[docs] +def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_large + >>> model = crnn_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_mobilenet_v3_large", pretrained, mobilenet_v3_large_r, **kwargs)
- return _crnn('crnn_resnet31', pretrained, **kwargs)
@@ -554,8 +657,8 @@

Source code for doctr.models.recognition.crnn.tensorflow
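The rewritten _crnn builder above loads pretrained weights with skip_mismatch whenever a custom vocab is supplied, so the builders can be used directly for fine-tuning on a different character set. An illustrative call, where the digits-only vocabulary is only an example:

>>> from doctr.models import crnn_vgg16_bn
>>> # custom vocabulary: layers whose shape depends on the vocab size are
>>> # skipped when loading the pretrained weights, the rest is reused
>>> model = crnn_vgg16_bn(pretrained=True, vocab="0123456789")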

- +
+ diff --git a/v0.4.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/master/tensorflow.html index 2dc5a27717..aa6aa69325 100644 --- a/v0.4.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.master.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import tensorflow as tf
-from tensorflow.keras import layers, Sequential, Model
-from typing import Tuple, List, Dict, Any, Optional
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
 
-from ..core import RecognitionPostProcessor
-from ...backbones.resnet import ResnetStage
-from ...utils import conv_sequence, load_pretrained_params
-from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
-from ....datasets import VOCABS
-from .base import _MASTER, _MASTERPostProcessor
+import tensorflow as tf
+from tensorflow.keras import Model, layers
+
+from doctr.datasets import VOCABS
+from doctr.models.classification import magc_resnet31
+from doctr.models.modules.transformer import Decoder, PositionalEncoding
 
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from .base import _MASTER, _MASTERPostProcessor
 
-__all__ = ['MASTER', 'master', 'MASTERPostProcessor']
+__all__ = ["MASTER", "master"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'master': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'input_shape': (48, 160, 3),
-        'vocab': VOCABS['french'],
-        'url': None,
+    "master": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
     },
 }
 
 
-class MAGC(layers.Layer):
-
-    """Implements the Multi-Aspect Global Context Attention, as described in
-    <https://arxiv.org/pdf/1910.02562.pdf>`_.
-
-    Args:
-        inplanes: input channels
-        headers: number of headers to split channels
-        att_scale: if True, re-scale attention to counteract the variance distibutions
-        **kwargs
-    """
-
-    def __init__(
-        self,
-        inplanes: int,
-        headers: int = 1,
-        att_scale: bool = False,
-        **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-
-        self.headers = headers  # h
-        self.inplanes = inplanes  # C
-        self.att_scale = att_scale
-
-        self.single_header_inplanes = int(inplanes / headers)  # C / h
-
-        self.conv_mask = tf.keras.layers.Conv2D(
-            filters=1,
-            kernel_size=1,
-            kernel_initializer=tf.initializers.he_normal()
-        )
-
-        self.transform = tf.keras.Sequential(
-            [
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-                tf.keras.layers.LayerNormalization([1, 2, 3]),
-                tf.keras.layers.ReLU(),
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-            ],
-            name='transform'
-        )
-
-    @tf.function
-    def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
-        b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
-
-        # B, H, W, C -->> B*h, H, W, C/h
-        x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes))
-        x = tf.transpose(x, perm=(0, 3, 1, 2, 4))
-        x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes))
-
-        # Compute shorcut
-        shortcut = x
-        # B*h, 1, H*W, C/h
-        shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes))
-        # B*h, 1, C/h, H*W
-        shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2])
-
-        # Compute context mask
-        # B*h, H, W, 1,
-        context_mask = self.conv_mask(x)
-        # B*h, 1, H*W, 1
-        context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
-        # scale variance
-        if self.att_scale and self.headers > 1:
-            context_mask = context_mask / tf.sqrt(self.single_header_inplanes)
-        # B*h, 1, H*W, 1
-        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
-
-        # Compute context
-        # B*h, 1, C/h, 1
-        context = tf.matmul(shortcut, context_mask)
-        context = tf.reshape(context, shape=(b, 1, c, 1))
-        # B, 1, 1, C
-        context = tf.transpose(context, perm=(0, 1, 3, 2))
-        # Set shape to resolve shape when calling this module in the Sequential MAGCResnet
-        batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1]
-        context.set_shape([batch, 1, 1, chan])
-        return context
-
-    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
-        # Context modeling: B, H, W, C  ->  B, 1, 1, C
-        context = self.context_modeling(inputs)
-        # Transform: B, 1, 1, C  ->  B, 1, 1, C
-        transformed = self.transform(context)
-        return inputs + transformed
-
-
-class MAGCResnet(Sequential):
-
-    """Implements the modified resnet with MAGC layers, as described in paper.
-
-    Args:
-        headers: number of header to split channels in MAGC layers
-        input_shape: shape of the model input (without batch dim)
-    """
-
-    def __init__(
-        self,
-        headers: int = 1,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
-    ) -> None:
-        _layers = [
-            # conv_1x
-            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
-            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_2x
-            ResnetStage(num_blocks=1, output_channels=256),
-            MAGC(inplanes=256, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_3x
-            ResnetStage(num_blocks=2, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 1), (2, 1)),
-            # conv_4x
-            ResnetStage(num_blocks=5, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            # conv_5x
-            ResnetStage(num_blocks=3, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-        ]
-        super().__init__(_layers)
-
-
 class MASTER(_MASTER, Model):
-
     """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
     Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
 
     Args:
+    ----
+        feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary, (without EOS, SOS, PAD)
         d_model: d parameter for the transformer decoder
-        headers: headers for the MAGC module
         dff: depth of the pointwise feed-forward layer
         num_heads: number of heads for the multi-head attention module
         num_layers: number of decoder layers to stack
         max_length: maximum length of character sequence handled by the model
-        input_size: size of the image inputs
+        dropout: dropout probability of the decoder
+        input_shape: size of the image inputs
+        exportable: onnx exportable returns only logits
+        cfg: dictionary containing information about the model
     """
 
     def __init__(
         self,
+        feature_extractor: Model,
         vocab: str,
         d_model: int = 512,
-        headers: int = 1,
         dff: int = 2048,
-        num_heads: int = 8,
+        num_heads: int = 8,  # number of heads in the transformer decoder
         num_layers: int = 3,
         max_length: int = 50,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
+        dropout: float = 0.2,
+        input_shape: Tuple[int, int, int] = (32, 128, 3),  # different from the paper
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
 
-        self.vocab = vocab
+        self.exportable = exportable
         self.max_length = max_length
+        self.d_model = d_model
+        self.vocab = vocab
         self.cfg = cfg
         self.vocab_size = len(vocab)
 
-        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
-        self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS
+        self.feat_extractor = feature_extractor
+        self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])
 
         self.decoder = Decoder(
             num_layers=num_layers,
-            d_model=d_model,
+            d_model=self.d_model,
             num_heads=num_heads,
+            vocab_size=self.vocab_size + 3,  # EOS, SOS, PAD
             dff=dff,
-            vocab_size=self.vocab_size,
-            maximum_position_encoding=max_length,
+            dropout=dropout,
+            maximum_position_encoding=self.max_length,
         )
-        self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
-        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
 
+        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
 
     @tf.function
-    def make_mask(self, target: tf.Tensor) -> tf.Tensor:
-        look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
-        target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
-        combined_mask = tf.maximum(target_padding_mask, look_ahead_mask)
-        return combined_mask
+    def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
+        # (N, 1, 1, max_length)
+        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
+        target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
+        target_length = target.shape[1]
+        # sub mask filled diagonal with 1 = see 0 = masked (max_length, max_length)
+        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
+        # source mask filled with ones (max_length, positional_encoded_seq_len)
+        source_mask = tf.ones((target_length, source.shape[1]))
+        # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
+        target_mask = tf.math.logical_and(
+            tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
+        )
+        return source_mask, target_mask
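For reference, the padding/look-ahead combination built by make_source_and_target_mask boils down to the following standalone sketch; the pad index and the tiny target batch are made up for illustration:

import tensorflow as tf

pad_idx = 12
target = tf.constant([[10, 3, 5, pad_idx], [10, 7, pad_idx, pad_idx]])

# (N, 1, 1, T): True where a real token is present, False on padding
pad_mask = tf.math.not_equal(target, pad_idx)[:, tf.newaxis, tf.newaxis, :]
# (T, T): lower triangular, position i may only attend to positions <= i
causal_mask = tf.cast(tf.linalg.band_part(tf.ones((4, 4)), -1, 0), tf.bool)
# (N, 1, T, T): False entries are masked out of the attention scores
target_mask = tf.logical_and(causal_mask, pad_mask)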
 
+    @staticmethod
     def compute_loss(
-        self,
         model_output: tf.Tensor,
         gt: tf.Tensor,
         seq_len: List[int],
@@ -512,11 +413,13 @@ 

Source code for doctr.models.recognition.master.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -532,7 +435,7 @@

Source code for doctr.models.recognition.master.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) @@ -547,94 +450,103 @@

Source code for doctr.models.recognition.master.tensorflow

"""Call function for training Args: + ---- x: images target: list of str labels return_model_output: if True, return logits return_preds: if True, decode logits + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A dictionnary containing eventually loss, logits and predictions. """ - # Encode - feature = self.feature_extractor(x, **kwargs) - b, h, w, c = (tf.shape(feature)[i] for i in range(4)) + feature = self.feat_extractor(x, **kwargs) + b, h, w, c = feature.get_shape() + # (N, H, W, C) --> (N, H * W, C) feature = tf.reshape(feature, shape=(b, h * w, c)) - encoded = feature + self.feature_pe[:, :h * w, :] + # add positional encoding to features + encoded = self.positional_encoding(feature, **kwargs) out: Dict[str, tf.Tensor] = {} + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") + if target is not None: # Compute target: tensor of gts and sequence lengths - gt, seq_len = self.compute_target(target) - - if kwargs.get('training', False): - if target is None: - raise AssertionError("In training mode, you need to pass a value to 'target'") - tgt_mask = self.make_mask(gt) + gt, seq_len = self.build_target(target) + # Compute decoder masks + source_mask, target_mask = self.make_source_and_target_mask(encoded, gt) # Compute logits - output = self.decoder(gt, encoded, tgt_mask, None, **kwargs) + output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) - else: - # When not training, we want to compute logits in with the decoder, although - # we have access to gts (we need gts to compute the loss, but not in the decoder) logits = self.decode(encoded, **kwargs) + logits = _bf16_to_float32(logits) + + if self.exportable: + out["logits"] = logits + return out + if target is not None: - out['loss'] = self.compute_loss(logits, gt, seq_len) + out["loss"] = self.compute_loss(logits, gt, seq_len) if return_model_output: - out['out_map'] = logits + out["out_map"] = logits if return_preds: - predictions = self.postprocessor(logits) - out['preds'] = predictions + out["preds"] = self.postprocessor(logits) return out + @tf.function def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor: """Decode function for prediction Args: + ---- encoded: encoded features + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A Tuple of tf.Tensor: predictions, logits """ - b = tf.shape(encoded)[0] - max_len = tf.constant(self.max_length, dtype=tf.int32) + b = encoded.shape[0] + start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32) # SOS padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32) # PAD - ys = tf.fill(dims=(b, max_len - 1), value=padding_symbol) + ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol) start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols - # max_len = len + 2 (sos + eos) + # Final dimension include EOS/SOS/PAD for i in range(self.max_length - 1): - ys_mask = self.make_mask(ys) - output = self.decoder(ys, encoded, ys_mask, None, **kwargs) + source_mask, target_mask = self.make_source_and_target_mask(encoded, ys) + output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) prob = tf.nn.softmax(logits, axis=-1) - next_word = tf.argmax(prob, axis=-1, output_type=ys.dtype) - # ys.shape = B, T - 
i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(max_len), indexing='ij') + next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype) + # update ys with the next token and ignore the first token (SOS) + i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij") indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1) - ys = tf.tensor_scatter_nd_update(ys, indices, next_word[:, i + 1]) + ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i]) - # final_logits of shape (N, max_length - 1, vocab_size + 1) (whithout sos) + # Shape (N, max_length, vocab_size + 1) return logits class MASTERPostProcessor(_MASTERPostProcessor): """Post processor for MASTER architectures + Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -649,51 +561,66 @@

Source code for doctr.models.recognition.master.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER: +def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"]) + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) - kwargs['vocab'] = _cfg['vocab'] + kwargs["vocab"] = _cfg["vocab"] + kwargs["input_shape"] = _cfg["input_shape"] # Build the model - model = MASTER(cfg=_cfg, **kwargs) + model = MASTER( + backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False), + cfg=_cfg, + **kwargs, + ) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model
-[docs] +[docs] def master(pretrained: bool = False, **kwargs: Any) -> MASTER: """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import master - >>> model = master(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + + >>> import tensorflow as tf + >>> from doctr.models import master + >>> model = master(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keywoard arguments passed to the MASTER architecture + Returns: + ------- text recognition architecture """ - - return _master('master', pretrained, **kwargs)
+ return _master("master", pretrained, magc_resnet31, **kwargs)
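The decode method above fills the target sequence one position per step with the argmax of the current logits, starting from a sequence initialised with an SOS symbol followed by PAD symbols. A condensed sketch of that greedy loop, where step_fn stands in for one decoder-plus-linear pass (an assumption, not a docTR symbol):

import tensorflow as tf

def greedy_decode(step_fn, batch_size: int, vocab_size: int, max_length: int) -> tf.Tensor:
    sos, pad = vocab_size + 1, vocab_size + 2
    ys = tf.concat(
        [tf.fill((batch_size, 1), sos), tf.fill((batch_size, max_length - 1), pad)], axis=-1
    )
    logits = None
    for i in range(max_length - 1):
        logits = step_fn(ys)  # (N, max_length, vocab_size + 3)
        next_token = tf.argmax(logits, axis=-1, output_type=ys.dtype)
        # write the prediction made at step i into position i + 1 of ys
        indices = tf.stack([tf.range(batch_size), tf.fill((batch_size,), i + 1)], axis=1)
        ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i])
    return logits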
@@ -727,8 +654,8 @@

Source code for doctr.models.recognition.master.tensorflow

- +
+ diff --git a/v0.4.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.4.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- +
diff --git a/v0.4.0/_modules/doctr/models/recognition/sar.html b/v0.4.0/_modules/doctr/models/recognition/sar.html
deleted file mode 100644
index 2482e9f156..0000000000
--- a/v0.4.0/_modules/doctr/models/recognition/sar.html
+++ /dev/null
@@ -1,712 +0,0 @@
-doctr.models.recognition.sar - docTR documentation

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H * W) -> (N, 1)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, 1)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + 1) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/sar/tensorflow.html index e514e4f0c4..4a591e6451 100644 --- a/v0.4.0/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.sar.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
+
 import tensorflow as tf
-from tensorflow.keras import Sequential, layers, Model
-from typing import Tuple, Dict, List, Any, Optional
+from tensorflow.keras import Model, Sequential, layers
 
-from ... import backbones
-from ...utils import load_pretrained_params
-from ..core import RecognitionModel, RecognitionPostProcessor
+from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
 
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
+from ...classification import resnet31
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from ..core import RecognitionModel, RecognitionPostProcessor
+
+__all__ = ["SAR", "sar_resnet31"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
+    "sar_resnet31": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/sar_resnet31-5a58806c.weights.h5&src=0",
     },
 }
 
 
+class SAREncoder(layers.Layer, NestedObject):
+    """Implements encoder module of the SAR model
+
+    Args:
+    ----
+        rnn_units: number of hidden rnn units
+        dropout_prob: dropout probability
+    """
+
+    def __init__(self, rnn_units: int, dropout_prob: float = 0.0) -> None:
+        super().__init__()
+        self.rnn = Sequential([
+            layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
+            layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
+        ])
+
+    def call(
+        self,
+        x: tf.Tensor,
+        **kwargs: Any,
+    ) -> tf.Tensor:
+        # (N, C)
+        return self.rnn(x, **kwargs)
+
+
 class AttentionModule(layers.Layer, NestedObject):
     """Implements attention module of the SAR model
 
     Args:
+    ----
         attention_units: number of hidden attention units
 
     """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
 
+    def __init__(self, attention_units: int) -> None:
         super().__init__()
         self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            3,
+            strides=1,
+            use_bias=True,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
+            1,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.flatten = layers.Flatten()
 
@@ -343,12 +395,12 @@ 

Source code for doctr.models.recognition.sar.tensorflow

hidden_state: tf.Tensor, **kwargs: Any, ) -> tf.Tensor: - [H, W] = features.get_shape().as_list()[1:3] - # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) - hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) # shape (N, H, W, vgg_units) -> (N, H, W, attention_units) features_projection = self.features_projector(features, **kwargs) + # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) + hidden_state = tf.expand_dims(tf.expand_dims(hidden_state, axis=1), axis=1) + hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) projection = tf.math.tanh(hidden_state_projection + features_projection) # shape (N, H, W, attention_units) -> (N, H, W, 1) attention = self.attention_projector(projection, **kwargs) @@ -358,23 +410,25 @@

Source code for doctr.models.recognition.sar.tensorflow

# shape (N, H * W) -> (N, H, W, 1) attention_map = tf.reshape(attention, [-1, H, W, 1]) glimpse = tf.math.multiply(features, attention_map) - # shape (N, H * W) -> (N, 1) - glimpse = tf.reduce_sum(glimpse, axis=[1, 2]) - return glimpse + # shape (N, H * W) -> (N, C) + return tf.reduce_sum(glimpse, axis=[1, 2]) class SARDecoder(layers.Layer, NestedObject): """Implements decoder module of the SAR model Args: + ---- rnn_units: number of hidden units in recurrent cells max_length: maximum length of a sequence vocab_size: number of classes in the model alphabet embedding_units: number of hidden embedding units attention_units: number of hidden attention units - num_decoder_layers: number of LSTM layers to stack + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability """ + def __init__( self, rnn_units: int, @@ -382,23 +436,22 @@
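To make the attention step above concrete: the holistic hidden state and the feature map are each projected, combined through a tanh, turned into a softmax over the spatial positions, and reduced to a glimpse vector. A standalone sketch with illustrative sizes, where a Dense layer replaces the 1x1 convolution on the hidden state for brevity:

import tensorflow as tf
from tensorflow.keras import layers

attention_units, rnn_units = 64, 128
features = tf.random.normal((2, 4, 16, 256))      # (N, H, W, C)
hidden_state = tf.random.normal((2, rnn_units))   # (N, rnn_units)

hidden_proj = layers.Dense(attention_units, use_bias=False)
feat_proj = layers.Conv2D(attention_units, 3, padding="same")
score_proj = layers.Conv2D(1, 1, use_bias=False)

# broadcast (N, 1, 1, attention_units) against (N, H, W, attention_units)
projection = tf.math.tanh(
    hidden_proj(hidden_state)[:, tf.newaxis, tf.newaxis, :] + feat_proj(features)
)
scores = score_proj(projection)                                # (N, H, W, 1)
weights = tf.nn.softmax(tf.reshape(scores, (2, -1)), axis=-1)  # over H * W
attention_map = tf.reshape(weights, (2, 4, 16, 1))
glimpse = tf.reduce_sum(features * attention_map, axis=[1, 2])  # (N, C)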

Source code for doctr.models.recognition.sar.tensorflow

vocab_size: int, embedding_units: int, attention_units: int, - num_decoder_layers: int = 2, - input_shape: Optional[List[Tuple[Optional[int]]]] = None, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, ) -> None: - super().__init__() self.vocab_size = vocab_size - self.lstm_decoder = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] - ) - self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) - self.attention_module = AttentionModule(attention_units) - self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units)) self.max_length = max_length - # Initialize kernels - if input_shape is not None: - self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units))) + self.embed = layers.Dense(embedding_units, use_bias=False) + self.embed_tgt = layers.Embedding(embedding_units, self.vocab_size + 1) + + self.lstm_cells = layers.StackedRNNCells([ + layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells) + ]) + self.attention_module = AttentionModule(attention_units) + self.output_dense = layers.Dense(self.vocab_size + 1, use_bias=True) + self.dropout = layers.Dropout(dropout_prob) def call( self, @@ -407,40 +460,47 @@

Source code for doctr.models.recognition.sar.tensorflow

gt: Optional[tf.Tensor] = None, **kwargs: Any, ) -> tf.Tensor: - - # initialize states (each of shape (N, rnn_units)) - states = self.lstm_decoder.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=tf.float32 - ) - # run first step of lstm - # holistic: shape (N, rnn_units) - _, states = self.lstm_decoder(holistic, states, **kwargs) - # Initialize with the index of virtual START symbol (placed after <eos>) - symbol = tf.fill(features.shape[0], self.vocab_size + 1) - logits_list = [] - if kwargs.get('training') and gt is None: - raise ValueError('Need to provide labels during training for teacher forcing') - for t in range(self.max_length + 1): # keep 1 step for <eos> - # one-hot symbol with depth vocab_size + 1 - # embeded_symbol: shape (N, embedding_units) - embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs) - logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs) - glimpse = self.attention_module( - features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs, - ) - # logits: shape (N, rnn_units), glimpse: shape (N, 1) - logits = tf.concat([logits, glimpse], axis=-1) - # shape (N, rnn_units + 1) -> (N, vocab_size + 1) - logits = self.output_dense(logits, **kwargs) - # update symbol with predicted logits for t+1 step - if kwargs.get('training'): - symbol = gt[:, t] # type: ignore[index] + if gt is not None: + gt_embedding = self.embed_tgt(gt, **kwargs) + + logits_list: List[tf.Tensor] = [] + + for t in range(self.max_length + 1): # 32 + if t == 0: + # step to init the first states of the LSTMCell + states = self.lstm_cells.get_initial_state( + inputs=None, batch_size=features.shape[0], dtype=features.dtype + ) + prev_symbol = holistic + elif t == 1: + # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros + # (N, vocab_size + 1) --> (N, embedding_units) + prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1], dtype=features.dtype) + prev_symbol = self.embed(prev_symbol, **kwargs) else: - symbol = tf.argmax(logits, axis=-1) - logits_list.append(logits) - outputs = tf.stack(logits_list, axis=1) # shape (N, max_length + 1, vocab_size + 1) - - return outputs + if gt is not None and kwargs.get("training", False): + # (N, embedding_units) -2 because of <bos> and <eos> (same) + prev_symbol = self.embed(gt_embedding[:, t - 2], **kwargs) + else: + # -1 to start at timestep where prev_symbol was initialized + index = tf.argmax(logits_list[t - 1], axis=-1) + # update prev_symbol with ones at the index of the previous logit vector + prev_symbol = self.embed(self.embed_tgt(index, **kwargs), **kwargs) + + # (N, C), (N, C) take the last hidden state and cell state from current timestep + _, states = self.lstm_cells(prev_symbol, states, **kwargs) + # states = (hidden_state, cell_state) + hidden_state = states[0][0] + # (N, H, W, C), (N, C) --> (N, C) + glimpse = self.attention_module(features, hidden_state, **kwargs) + # (N, C), (N, C) --> (N, 2 * C) + logits = tf.concat([hidden_state, glimpse], axis=1) + logits = self.dropout(logits, **kwargs) + # (N, vocab_size + 1) + logits_list.append(self.output_dense(logits, **kwargs)) + + # (max_length + 1, N, vocab_size + 1) --> (N, max_length + 1, vocab_size + 1) + return tf.transpose(tf.stack(logits_list[1:]), (1, 0, 2)) class SAR(Model, RecognitionModel): @@ -448,17 +508,20 @@

Source code for doctr.models.recognition.sar.tensorflow

Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of hidden units in both encoder and decoder LSTM embedding_units: number of embedding units attention_units: number of hidden units in attention module max_length: maximum word length handled by the model - num_decoders: number of LSTM to stack in decoder layer - + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability for the encoder and decoder + exportable: onnx exportable returns only logits + cfg: dictionary containing information about the model """ - _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"] def __init__( self, @@ -468,36 +531,34 @@

Source code for doctr.models.recognition.sar.tensorflow

embedding_units: int = 512, attention_units: int = 512, max_length: int = 30, - num_decoders: int = 2, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: - super().__init__() self.vocab = vocab + self.exportable = exportable self.cfg = cfg - self.max_length = max_length + 1 # Add 1 timestep for EOS after the longest word self.feat_extractor = feature_extractor - self.encoder = Sequential( - [ - layers.LSTM(units=rnn_units, return_sequences=True), - layers.LSTM(units=rnn_units, return_sequences=False) - ] - ) - # Initialize the kernels (watch out for reduce_max) - self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:]) - + self.encoder = SAREncoder(rnn_units, dropout_prob) self.decoder = SARDecoder( - rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders, - input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape] + rnn_units, + self.max_length, + len(vocab), + embedding_units, + attention_units, + num_decoder_cells, + dropout_prob, ) self.postprocessor = SARPostProcessor(vocab=vocab) + @staticmethod def compute_loss( - self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: tf.Tensor, @@ -506,11 +567,13 @@

Source code for doctr.models.recognition.sar.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -525,7 +588,7 @@

Source code for doctr.models.recognition.sar.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len) masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) def call( @@ -536,16 +599,28 @@
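The masked cross-entropy in this hunk can be reproduced in isolation as follows; batch size, timesteps and labels are made up, and the +1 accounts for the <eos> timestep exactly as in the code above:

import tensorflow as tf

batch, time, num_classes = 2, 5, 8
logits = tf.random.normal((batch, time, num_classes))
gt = tf.constant([[1, 2, 3, 0, 0], [4, 5, 0, 0, 0]])
seq_len = tf.constant([3, 2]) + 1  # +1 for the <eos> timestep

oh_gt = tf.one_hot(gt, depth=num_classes)
cce = tf.nn.softmax_cross_entropy_with_logits(labels=oh_gt, logits=logits)  # (batch, time)
mask = tf.sequence_mask(seq_len, time)
masked = tf.where(mask, cce, tf.zeros_like(cce))
# per-sample average over the unmasked timesteps
ce_loss = tf.reduce_sum(masked, axis=1) / tf.cast(seq_len, cce.dtype)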

Source code for doctr.models.recognition.sar.tensorflow

return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - features = self.feat_extractor(x, **kwargs) - pooled_features = tf.reduce_max(features, axis=1) # vertical max pooling + # vertical max pooling --> (N, C, W) + pooled_features = tf.reduce_max(features, axis=1) + # holistic (N, C) encoded = self.encoder(pooled_features, **kwargs) + if target is not None: - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) seq_len = tf.cast(seq_len, tf.int32) - decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training for teacher forcing") + + decoded_features = _bf16_to_float32( + self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + ) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = decoded_features + return out + if return_model_output: out["out_map"] = decoded_features @@ -554,7 +629,7 @@

Source code for doctr.models.recognition.sar.tensorflow

out["preds"] = self.postprocessor(decoded_features) if target is not None: - out['loss'] = self.compute_loss(decoded_features, gt, seq_len) + out["loss"] = self.compute_loss(decoded_features, gt, seq_len) return out @@ -563,9 +638,8 @@

Source code for doctr.models.recognition.sar.tensorflow

"""Post processor for SAR architectures Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -580,95 +654,75 @@

Source code for doctr.models.recognition.sar.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: +def _sar( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> SAR: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units']) - _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units']) - _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length']) - _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) # Feature extractor - feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + pretrained=pretrained_backbone, + input_shape=_cfg["input_shape"], include_top=False, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - kwargs['embedding_units'] = _cfg['embedding_units'] - kwargs['attention_units'] = _cfg['attention_units'] - kwargs['max_length'] = _cfg['max_length'] - kwargs['num_decoders'] = _cfg['num_decoders'] + kwargs["vocab"] = _cfg["vocab"] # Build the model model = SAR(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model -
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - -
-[docs] +[docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import sar_resnet31 + >>> model = sar_resnet31(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the SAR architecture Returns: + ------- text recognition architecture """ - - return _sar('sar_resnet31', pretrained, **kwargs)
+ return _sar("sar_resnet31", pretrained, resnet31, **kwargs)
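A quick, hedged usage sketch for the sar_resnet31 factory in this hunk (not part of the diff itself; it assumes a TensorFlow install of docTR and reuses the input shape from the docstring above):

import tensorflow as tf
from doctr.models import sar_resnet31

model = sar_resnet31(pretrained=False)
input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)

# Inference: the call returns a dict; ask for decoded words and confidences
out = model(input_tensor, return_preds=True, training=False)

# Training requires labels, since the decoder is teacher-forced
out = model(input_tensor, target=["hello"], training=True)
loss = out["loss"]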
@@ -702,8 +756,8 @@

Source code for doctr.models.recognition.sar.tensorflow

- +
+ diff --git a/v0.4.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.4.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.4.0/_modules/doctr/models/recognition/zoo.html b/v0.4.0/_modules/doctr/models/recognition/zoo.html index bf0ae6af6e..f664304019 100644 --- a/v0.4.0/_modules/doctr/models/recognition/zoo.html +++ b/v0.4.0/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
-from doctr.file_utils import is_tf_available, is_torch_available
-from .core import RecognitionPredictor
-from ..preprocessor import PreProcessor
-from .. import recognition
+from doctr.file_utils import is_tf_available
+from doctr.models.preprocessor import PreProcessor
 
+from .. import recognition
+from .predictor import RecognitionPredictor
 
 __all__ = ["recognition_predictor"]
 
 
-if is_tf_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
-elif is_torch_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
+ARCHS: List[str] = [
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "sar_resnet31",
+    "master",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
+]
+
 
+def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+        _model = recognition.__dict__[arch](
+            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
+        )
+    else:
+        if not isinstance(
+            arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+        ):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+    kwargs.pop("pretrained_backbone", None)
 
-    _model = recognition.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 32)
-    predictor = RecognitionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
-        _model
-    )
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
+    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
+    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
 
     return predictor
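As a rough illustration of the dispatch above (a sketch, assuming the public recognition_predictor wrapper defined just below, which forwards here), the architecture can now be passed either by name or as an already-built model:

from doctr.models import recognition, recognition_predictor

# By name: the string must be one of the entries in ARCHS
predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True)

# By instance: any supported recognition model object is used as-is
custom_model = recognition.sar_resnet31(pretrained=False)
predictor = recognition_predictor(custom_model)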
 
 
 
-[docs] -def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor: +[docs] +def recognition_predictor( + arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + symmetric_pad: bool = False, + batch_size: int = 128, + **kwargs: Any, +) -> RecognitionPredictor: """Text recognition architecture. Example:: @@ -326,14 +369,18 @@

Source code for doctr.models.recognition.zoo

        >>> out = model([input_page])
 
     Args:
-        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
+    ----
+        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
+        **kwargs: optional parameters to be passed to the architecture
 
     Returns:
+    -------
         Recognition predictor
     """
-
-    return _predictor(arch, pretrained, **kwargs)
+ return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
@@ -367,8 +414,8 @@

Source code for doctr.models.recognition.zoo

   
-
- +
+ diff --git a/v0.4.0/_modules/doctr/models/zoo.html b/v0.4.0/_modules/doctr/models/zoo.html index dec6857019..d459671648 100644 --- a/v0.4.0/_modules/doctr/models/zoo.html +++ b/v0.4.0/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from typing import Any
-from .core import OCRPredictor
+
 from .detection.zoo import detection_predictor
+from .kie_predictor import KIEPredictor
+from .predictor import OCRPredictor
 from .recognition.zoo import recognition_predictor
 
+__all__ = ["ocr_predictor", "kie_predictor"]
 
-__all__ = ["ocr_predictor"]
-
-
-def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
 
+def _predictor(
+    det_arch: Any,
+    reco_arch: Any,
+    pretrained: bool,
+    pretrained_backbone: bool = True,
+    assume_straight_pages: bool = True,
+    preserve_aspect_ratio: bool = True,
+    symmetric_pad: bool = True,
+    det_bs: int = 2,
+    reco_bs: int = 128,
+    detect_orientation: bool = False,
+    straighten_pages: bool = False,
+    detect_language: bool = False,
+    **kwargs,
+) -> OCRPredictor:
     # Detection
-    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
+    det_predictor = detection_predictor(
+        det_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=det_bs,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+    )
 
     # Recognition
-    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
+    reco_predictor = recognition_predictor(
+        reco_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=reco_bs,
+    )
 
-    return OCRPredictor(det_predictor, reco_predictor)
+    return OCRPredictor(
+        det_predictor,
+        reco_predictor,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+        detect_orientation=detect_orientation,
+        straighten_pages=straighten_pages,
+        detect_language=detect_language,
+        **kwargs,
+    )
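For reference, a short end-to-end sketch of the pipeline assembled above (mirroring the ocr_predictor docstring below; it assumes pretrained weights can be downloaded on first use):

import numpy as np
from doctr.models import ocr_predictor

model = ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True)
input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
out = model([input_page])  # detection first, then recognition on the located crops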
 
 
 
-[docs] +[docs] def ocr_predictor( - det_arch: str = 'db_resnet50', - reco_arch: str = 'crnn_vgg16_bn', + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", pretrained: bool = False, - **kwargs: Any + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - Example:: - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` Returns: + ------- OCR predictor """ + return _predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
+ + - return _predictor(det_arch, reco_arch, pretrained, **kwargs)
+def _kie_predictor( + det_arch: Any, + reco_arch: Any, + pretrained: bool, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + det_bs: int = 2, + reco_bs: int = 128, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs, +) -> KIEPredictor: + # Detection + det_predictor = detection_predictor( + det_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=det_bs, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + ) + + # Recognition + reco_predictor = recognition_predictor( + reco_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=reco_bs, + ) + + return KIEPredictor( + det_predictor, + reco_predictor, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + ) + + +
+[docs] +def kie_predictor( + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, +) -> KIEPredictor: + """End-to-end KIE architecture using one model for localization, and another for text recognition. + + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` + + Returns: + ------- + KIE predictor + """ + return _kie_predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
@@ -353,8 +575,8 @@

Source code for doctr.models.zoo

       
     
   
- - + + diff --git a/v0.4.0/_modules/doctr/transforms/modules.html b/v0.4.0/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.4.0/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - - - - - - - - - - - - doctr.transforms.modules - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/_modules/doctr/transforms/modules/base.html b/v0.4.0/_modules/doctr/transforms/modules/base.html index c42079a8fd..4596df3848 100644 --- a/v0.4.0/_modules/doctr/transforms/modules/base.html +++ b/v0.4.0/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.base

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+import math
 import random
-from typing import List, Any, Callable
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import numpy as np
 
 from doctr.utils.repr import NestedObject
+
 from .. import functional as F
 
+__all__ = ["SampleCompose", "ImageTransform", "ColorInversion", "OneOf", "RandomApply", "RandomRotate", "RandomCrop"]
+
+
+class SampleCompose(NestedObject):
+    """Implements a wrapper that will apply transformations sequentially on both image and target
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfo = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(30)])
+                >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4)))
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import torch
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfos = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(30)])
+                >>> out, out_boxes = transfos(torch.rand(8, 64, 64, 3), np.zeros((2, 4)))
+
+    Args:
+    ----
+        transforms: list of transformation modules
+    """
+
+    _children_names: List[str] = ["sample_transforms"]
+
+    def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None:
+        self.sample_transforms = transforms
+
+    def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]:
+        for t in self.sample_transforms:
+            x, target = t(x, target)
+
+        return x, target
+
+
+class ImageTransform(NestedObject):
+    """Implements a transform wrapper to turn an image-only transformation into an image+target transform
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion(min_val=0.6))
+                >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None)
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import torch
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion(min_val=0.6))
+                >>> out, _ = transfo(torch.rand(8, 64, 64, 3), None)
+
+    Args:
+    ----
+        transform: the image transformation module to wrap
+    """
+
+    _children_names: List[str] = ["img_transform"]
+
+    def __init__(self, transform: Callable[[Any], Any]) -> None:
+        self.img_transform = transform
 
-__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
+    def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]:
+        img = self.img_transform(img)
+        return img, target
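A combined sketch of the two wrappers above (TensorFlow variant, with dummy relative boxes; ImageTransform leaves the target untouched, while a geometric transform such as RandomRotate, defined further down in this module, updates it):

import numpy as np
import tensorflow as tf
from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate

transfo = SampleCompose([
    ImageTransform(ColorInversion(min_val=0.6)),  # image-only: boxes pass through unchanged
    RandomRotate(max_angle=10., expand=False),    # image and boxes are rotated together
])
img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
boxes = np.array([[0.1, 0.1, 0.4, 0.3]], dtype=np.float32)  # (N, 4) relative coordinates
out_img, out_boxes = transfo(img, boxes)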
 
 
 
-[docs] +[docs] class ColorInversion(NestedObject): """Applies the following tranformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(torch.rand(8, 64, 64, 3)) Args: + ---- min_val: range [min_val, 1] to colorize RGB pixels """ + def __init__(self, min_val: float = 0.5) -> None: self.min_val = min_val @@ -316,59 +437,178 @@

Source code for doctr.transforms.modules.base

-[docs] +[docs] class OneOf(NestedObject): """Randomly apply one of the input transformations - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transforms: list of transformations, one only will be picked """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: # Pick transformation transfo = self.transforms[int(random.random() * len(self.transforms))] # Apply - return transfo(img)
+ return transfo(img) if target is None else transfo(img, target) # type: ignore[call-arg]
-[docs] +[docs] class RandomApply(NestedObject): """Apply with a probability p the input transformation - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transform: transformation to apply p: probability to apply """ - def __init__(self, transform: Callable[[Any], Any], p: float = .5) -> None: + + def __init__(self, transform: Callable[[Any], Any], p: float = 0.5) -> None: self.transform = transform self.p = p def extra_repr(self) -> str: return f"transform={self.transform}, p={self.p}" - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: if random.random() < self.p: - return self.transform(img) - return img
+ return self.transform(img) if target is None else self.transform(img, target) # type: ignore[call-arg] + return img if target is None else (img, target)
+ + + +
+[docs] +class RandomRotate(NestedObject): + """Randomly rotate a tensor image and its boxes + + .. image:: https://doctr-static.mindee.com/models?id=v0.4.0/rotation_illustration.png&src=0 + :align: center + + Args: + ---- + max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in + [-max_angle, max_angle] + expand: whether the image should be padded before the rotation + """ + + def __init__(self, max_angle: float = 5.0, expand: bool = False) -> None: + self.max_angle = max_angle + self.expand = expand + + def extra_repr(self) -> str: + return f"max_angle={self.max_angle}, expand={self.expand}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + angle = random.uniform(-self.max_angle, self.max_angle) + r_img, r_polys = F.rotate_sample(img, target, angle, self.expand) + # Removes deleted boxes + is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2 + return r_img, r_polys[is_kept]
+ + + +
+[docs] +class RandomCrop(NestedObject): + """Randomly crop a tensor image and its boxes + + Args: + ---- + scale: tuple of floats, relative (min_area, max_area) of the crop + ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w + """ + + def __init__(self, scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: + self.scale = scale + self.ratio = ratio + + def extra_repr(self) -> str: + return f"scale={self.scale}, ratio={self.ratio}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + scale = random.uniform(self.scale[0], self.scale[1]) + ratio = random.uniform(self.ratio[0], self.ratio[1]) + + height, width = img.shape[:2] + + # Calculate crop size + crop_area = scale * width * height + aspect_ratio = ratio * (width / height) + crop_width = int(round(math.sqrt(crop_area * aspect_ratio))) + crop_height = int(round(math.sqrt(crop_area / aspect_ratio))) + + # Ensure crop size does not exceed image dimensions + crop_width = min(crop_width, width) + crop_height = min(crop_height, height) + + # Randomly select crop position + x = random.randint(0, width - crop_width) + y = random.randint(0, height - crop_height) + + # relative crop box + crop_box = (x / width, y / height, (x + crop_width) / width, (y + crop_height) / height) + if target.shape[1:] == (4, 2): + min_xy = np.min(target, axis=1) + max_xy = np.max(target, axis=1) + _target = np.concatenate((min_xy, max_xy), axis=1) + else: + _target = target + + # Crop image and targets + croped_img, crop_boxes = F.crop_detection(img, _target, crop_box) + # hard fallback if no box is kept + if crop_boxes.shape[0] == 0: + return img, target + # clip boxes + return croped_img, np.clip(crop_boxes, 0, 1)
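A small sketch of the RandomRotate / RandomCrop augmentations added in this hunk (values are illustrative): the crop box is drawn in relative coordinates, surviving boxes are clipped back to [0, 1], and the original sample is returned unchanged if no box would survive (the hard fallback branch):

import numpy as np
import tensorflow as tf
from doctr.transforms import RandomCrop

transfo = RandomCrop(scale=(0.5, 1.0), ratio=(0.75, 1.33))
img = tf.random.uniform(shape=[128, 128, 3], minval=0, maxval=1)
boxes = np.array([[0.2, 0.2, 0.6, 0.5]], dtype=np.float32)  # (N, 4) relative coordinates
crop_img, crop_boxes = transfo(img, boxes)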
@@ -402,8 +642,8 @@

Source code for doctr.transforms.modules.base

- - + + diff --git a/v0.4.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.4.0/_modules/doctr/transforms/modules/tensorflow.html index 1d192a876b..acbbe96225 100644 --- a/v0.4.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.4.0/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
 import tensorflow as tf
-from typing import List, Any, Tuple, Callable
 
 from doctr.utils.repr import NestedObject
 
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
-           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
+from ..functional.tensorflow import _gaussian_filter, random_shadow
+
+__all__ = [
+    "Compose",
+    "Resize",
+    "Normalize",
+    "LambdaTransformation",
+    "ToGray",
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomSaturation",
+    "RandomHue",
+    "RandomGamma",
+    "RandomJpegQuality",
+    "GaussianBlur",
+    "ChannelShuffle",
+    "GaussianNoise",
+    "RandomHorizontalFlip",
+    "RandomShadow",
+    "RandomResize",
+]
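A short sketch chaining a few of the transforms listed above (all of them are defined later in this module; the values mirror the docstring examples):

import tensorflow as tf
from doctr.transforms import Compose, Resize, Normalize, GaussianBlur

transfos = Compose([
    Resize((32, 32)),
    GaussianBlur(kernel_shape=3, std=(0.1, 3.0)),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))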
 
 
 
-[docs] +[docs] class Compose(NestedObject): """Implements a wrapper that will apply transformations sequentially - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Compose, Resize + >>> transfos = Compose([Resize((32, 32))]) + >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- transforms: list of transformation modules """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms @@ -319,26 +361,27 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class Resize(NestedObject): """Resizes a tensor to a target size - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Resize + >>> transfo = Resize((32, 32)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- output_size: expected output size method: interpolation method preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically """ + def __init__( self, - output_size: Tuple[int, int], - method: str = 'bilinear', + output_size: Union[int, Tuple[int, int]], + method: str = "bilinear", preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, ) -> None: @@ -346,6 +389,14 @@

Source code for doctr.transforms.modules.tensorflow

self.method = method self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad + self.antialias = True + + if isinstance(self.output_size, int): + self.wanted_size = (self.output_size, self.output_size) + elif isinstance(self.output_size, (tuple, list)): + self.wanted_size = self.output_size + else: + raise AssertionError("Output size should be either a list, a tuple or an int") def extra_repr(self) -> str: _repr = f"output_size={self.output_size}, method='{self.method}'" @@ -353,64 +404,106 @@

Source code for doctr.transforms.modules.tensorflow

_repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" return _repr - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) + def __call__( + self, + img: tf.Tensor, + target: Optional[np.ndarray] = None, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: + input_dtype = img.dtype + self.output_size = ( + (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size + ) + + img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias) + # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio + raw_shape = img.shape[:2] + if self.symmetric_pad: + half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0) if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
+ if isinstance(self.output_size, (tuple, list)): + # In that case we need to pad because we want to enforce both width and height + if not self.symmetric_pad: + half_pad = (0, 0) + elif self.output_size[0] == img.shape[0]: + half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2)) + # Pad image + img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size) + + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: + if self.symmetric_pad: + offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1] + + if self.preserve_aspect_ratio: + # Get absolute coords + if target.shape[1:] == (4,): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1] + target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0] + else: + target[:, [0, 2]] *= raw_shape[1] / img.shape[1] + target[:, [1, 3]] *= raw_shape[0] / img.shape[0] + elif target.shape[1:] == (4, 2): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1] + target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0] + else: + target[..., 0] *= raw_shape[1] / img.shape[1] + target[..., 1] *= raw_shape[0] / img.shape[0] + else: + raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") + + return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1) + + return tf.cast(img, dtype=input_dtype)
-[docs] +[docs] class Normalize(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Normalize + >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- mean: average value per channel std: standard deviation per channel """ + def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) + self.mean = tf.constant(mean) + self.std = tf.constant(std) def extra_repr(self) -> str: return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std + img -= tf.cast(self.mean, dtype=img.dtype) + img /= tf.cast(self.std, dtype=img.dtype) return img
-[docs] +[docs] class LambdaTransformation(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import LambdaTransformation + >>> transfo = LambdaTransformation(lambda x: x/ 255.) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- fn: the function to be applied to the input tensor """ + def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: self.fn = fn @@ -420,37 +513,42 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class ToGray(NestedObject): """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import ToGray + >>> transfo = ToGray() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) """ + + def __init__(self, num_output_channels: int = 1): + self.num_output_channels = num_output_channels + def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
+ img = tf.image.rgb_to_grayscale(img) + return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
-[docs] +[docs] class RandomBrightness(NestedObject): """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomBrightness + >>> transfo = RandomBrightness() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] p: probability to apply transformation """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -463,21 +561,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomContrast(NestedObject): """Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomContrast + >>> transfo = RandomContrast() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) """ - def __init__(self, delta: float = .3) -> None: + + def __init__(self, delta: float = 0.3) -> None: self.delta = delta def extra_repr(self) -> str: @@ -489,21 +588,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomSaturation(NestedObject): """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomSaturation + >>> transfo = RandomSaturation() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) """ - def __init__(self, delta: float = .5) -> None: + + def __init__(self, delta: float = 0.5) -> None: self.delta = delta def extra_repr(self) -> str: @@ -515,19 +615,20 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomHue(NestedObject): """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHue + >>> transfo = RandomHue() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -540,22 +641,23 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomGamma(NestedObject): """randomly performs gamma correction for a tensor (batch of images or image) - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomGamma + >>> transfo = RandomGamma() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- min_gamma: non-negative real number, lower bound for gamma param max_gamma: non-negative real number, upper bound for gamma min_gain: lower bound for constant multiplier max_gain: upper bound for constant multiplier """ + def __init__( self, min_gamma: float = 0.5, @@ -580,20 +682,21 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomJpegQuality(NestedObject): """Randomly adjust jpeg quality of a 3 dimensional RGB image - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomJpegQuality + >>> transfo = RandomJpegQuality() + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- min_quality: int between [0, 100] max_quality: int between [0, 100] """ + def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: self.min_quality = min_quality self.max_quality = max_quality @@ -602,10 +705,224 @@

Source code for doctr.transforms.modules.tensorflow

return f"min_quality={self.min_quality}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality + return tf.image.random_jpeg_quality(img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality)
+ + + +
+[docs] +class GaussianBlur(NestedObject): + """Randomly blur a 3 dimensional RGB image with a Gaussian kernel + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianBlur + >>> transfo = GaussianBlur(3, (.1, 5)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + kernel_shape: size of the blurring kernel + std: min and max value of the standard deviation + """ + + def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None: + self.kernel_shape = kernel_shape + self.std = std + + def extra_repr(self) -> str: + return f"kernel_shape={self.kernel_shape}, std={self.std}" + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.squeeze( + _gaussian_filter( + img[tf.newaxis, ...], + kernel_size=self.kernel_shape, + sigma=random.uniform(self.std[0], self.std[1]), + mode="REFLECT", + ), + axis=0, )
+ + +
+[docs] +class ChannelShuffle(NestedObject): + """Randomly shuffle channel order of a given image""" + + def __init__(self): + pass + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
+ + + +
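For clarity, tf.random.shuffle only permutes along the first axis, which is why ChannelShuffle transposes the channels to the front, shuffles, then transposes back. A minimal sketch of the equivalent operation on an HWC tensor:

import tensorflow as tf

img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
chw = tf.transpose(img, perm=[2, 0, 1])       # HWC -> CHW so channels sit on axis 0
shuffled = tf.random.shuffle(chw)             # permutes the channel axis only
out = tf.transpose(shuffled, perm=[1, 2, 0])  # back to HWC, same shape as the input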
+[docs] +class GaussianNoise(NestedObject): + """Adds Gaussian Noise to the input tensor + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianNoise + >>> transfo = GaussianNoise(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + mean : mean of the gaussian distribution + std : std of the gaussian distribution + """ + + def __init__(self, mean: float = 0.0, std: float = 1.0) -> None: + super().__init__() + self.std = std + self.mean = mean + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), dtype=tf.uint8 + ) + else: + return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype) + + def extra_repr(self) -> str: + return f"mean={self.mean}, std={self.std}"
+ + + +
+[docs] +class RandomHorizontalFlip(NestedObject): + """Adds random horizontal flip to the input tensor/np.ndarray + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHorizontalFlip + >>> transfo = RandomHorizontalFlip(p=0.5) + >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1) + >>> target = np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32) + >>> out = transfo(image, target) + + Args: + ---- + p : probability of Horizontal Flip + """ + + def __init__(self, p: float) -> None: + super().__init__() + self.p = p + + def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + _img = tf.image.flip_left_right(img) + _target = target.copy() + # Changing the relative bbox coordinates + if target.shape[1:] == (4,): + _target[:, ::2] = 1 - target[:, [2, 0]] + else: + _target[..., 0] = 1 - target[..., 0] + return _img, _target + return img, target
+ + + +
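The relative-coordinate update above can be checked by hand: a horizontal flip maps xmin to 1 - xmax and xmax to 1 - xmin while leaving y untouched. A small numpy sketch with a single (xmin, ymin, xmax, ymax) box:

import numpy as np

target = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)
flipped = target.copy()
flipped[:, ::2] = 1 - target[:, [2, 0]]   # xmin' = 1 - xmax, xmax' = 1 - xmin
print(flipped)                            # [[0.6 0.1 0.9 0.5]]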
+[docs] +class RandomShadow(NestedObject): + """Adds random shade to the input image + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomShadow + >>> transfo = RandomShadow(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + opacity_range : minimum and maximum opacity of the shade + """ + + def __init__(self, opacity_range: Optional[Tuple[float, float]] = None) -> None: + super().__init__() + self.opacity_range = opacity_range if isinstance(opacity_range, tuple) else (0.2, 0.8) + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value( + tf.math.round(255 * random_shadow(tf.cast(x, dtype=tf.float32) / 255, self.opacity_range)), + 0, + 255, + ), + dtype=tf.uint8, + ) + else: + return tf.clip_by_value(random_shadow(x, self.opacity_range), 0, 1) + + def extra_repr(self) -> str: + return f"opacity_range={self.opacity_range}"
+ + + +
+[docs] +class RandomResize(NestedObject): + """Randomly resize the input image and align corresponding targets + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomResize + >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + scale_range: range of the resizing factor for width and height (independently) + preserve_aspect_ratio: whether to preserve the aspect ratio of the image, + given a float value, the aspect ratio will be preserved with this probability + symmetric_pad: whether to symmetrically pad the image, + given a float value, the symmetric padding will be applied with this probability + p: probability to apply the transformation + """ + + def __init__( + self, + scale_range: Tuple[float, float] = (0.3, 0.9), + preserve_aspect_ratio: Union[bool, float] = False, + symmetric_pad: Union[bool, float] = False, + p: float = 0.5, + ): + super().__init__() + self.scale_range = scale_range + self.preserve_aspect_ratio = preserve_aspect_ratio + self.symmetric_pad = symmetric_pad + self.p = p + self._resize = Resize + + def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + scale_h = random.uniform(*self.scale_range) + scale_w = random.uniform(*self.scale_range) + new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w)) + + _img, _target = self._resize( + new_size, + preserve_aspect_ratio=self.preserve_aspect_ratio + if isinstance(self.preserve_aspect_ratio, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + symmetric_pad=self.symmetric_pad + if isinstance(self.symmetric_pad, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + )(img, target) + + return _img, _target + return img, target + + def extra_repr(self) -> str: + return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}" # noqa: E501
+
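Since RandomResize realigns targets with the resized image, its __call__ expects both the image and an array of relative boxes. A minimal usage sketch with the TensorFlow backend (the box below is a dummy value, and p=1.0 forces the transformation to be applied):

import numpy as np
import tensorflow as tf
from doctr.transforms import RandomResize

transfo = RandomResize(scale_range=(0.3, 0.9), p=1.0)
img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
target = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)  # relative (xmin, ymin, xmax, ymax)
out_img, out_target = transfo(img, target)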
@@ -638,8 +955,8 @@

Source code for doctr.transforms.modules.tensorflow

- +
+ diff --git a/v0.4.0/_modules/doctr/utils/metrics.html b/v0.4.0/_modules/doctr/utils/metrics.html index 460c64a385..8a37d5949a 100644 --- a/v0.4.0/_modules/doctr/utils/metrics.html +++ b/v0.4.0/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.metrics

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-import cv2
-from typing import List, Tuple, Dict, Optional
-from unidecode import unidecode
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
-from doctr.utils.geometry import rbbox_to_polygon
+from shapely.geometry import Polygon
 
-__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
-           'nms', 'LocalizationConfusion', 'OCRMetric']
+__all__ = [
+    "TextMatch",
+    "box_iou",
+    "polygon_iou",
+    "nms",
+    "LocalizationConfusion",
+    "OCRMetric",
+    "DetectionMetric",
+]
 
 
 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
-    """Perform string comparison with multiple levels of tolerance
+    """Performs string comparison with multiple levels of tolerance
 
     Args:
+    ----
         word1: a string
         word2: another string
 
     Returns:
+    -------
         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-            unidecode counterparts and their lower-case unidecode counterparts match
+            anyascii counterparts and their lower-case anyascii counterparts match
     """
-    raw_match = (word1 == word2)
-    caseless_match = (word1.lower() == word2.lower())
-    unidecode_match = (unidecode(word1) == unidecode(word2))
+    raw_match = word1 == word2
+    caseless_match = word1.lower() == word2.lower()
+    anyascii_match = anyascii(word1) == anyascii(word2)
 
     # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
-    unicase_match = (unidecode(word1).lower() == unidecode(word2).lower())
+    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-    return raw_match, caseless_match, unidecode_match, unicase_match
+    return raw_match, caseless_match, anyascii_match, unicase_match
 
 
 
-[docs] +[docs] class TextMatch: - """Implements text match metric (word-level accuracy) for recognition task. + r"""Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \\forall X, Y \\in \\mathcal{W}^N, - TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) + \forall X, Y \in \mathcal{W}^N, + TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \\forall a, x \\in \\mathcal{W}, - f_a(x) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } x = a \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{W}` is the set of all possible character sequences, + \forall a, x \in \mathcal{W}, + f_a(x) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } x = a \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. - Example:: - >>> from doctr.utils import TextMatch - >>> metric = TextMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() """ def __init__(self) -> None: self.reset() +
+[docs] def update( self, gt: List[str], @@ -354,29 +386,32 @@

Source code for doctr.utils.metrics

         """Update the state of the metric with new predictions
 
         Args:
+        ----
            gt: list of ground-truth character sequences
-            pred: list of predicted character sequences"""
-
+            pred: list of predicted character sequences
+        """
         if len(gt) != len(pred):
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.unidecode += int(_unidecode)
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
-        self.total += len(gt)
+        self.total += len(gt)
+
-[docs] +[docs] def summary(self) -> Dict[str, float]: """Computes the aggregated metrics - Returns: - a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode - counterpart and its lower-case unidecode counterpart + Returns + ------- + a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii + counterpart and its lower-case anyascii counterpart """ if self.total == 0: raise AssertionError("you need to update the metric before getting the summary") @@ -384,7 +419,7 @@

Source code for doctr.utils.metrics

         return dict(
             raw=self.raw / self.total,
             caseless=self.caseless / self.total,
-            unidecode=self.unidecode / self.total,
+            anyascii=self.anyascii / self.total,
             unicase=self.unicase / self.total,
         )
@@ -392,23 +427,25 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.raw = 0
         self.caseless = 0
-        self.unidecode = 0
+        self.anyascii = 0
         self.unicase = 0
         self.total = 0
def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Compute the IoU between two sets of bounding boxes + """Computes the IoU between two sets of bounding boxes Args: + ---- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) + Returns: + ------- the IoU matrix of shape (N, M) """ - - iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) + iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) @@ -419,107 +456,54 @@

Source code for doctr.utils.metrics

         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
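A worked example, assuming the box_iou function above is in scope: with a ground truth of (0, 0, 100, 100) and a prediction of (0, 0, 70, 70), the intersection is 70 * 70 = 4900 and the union is 10000 + 4900 - 4900 = 10000, hence an IoU of 0.49:

import numpy as np

gts = np.array([[0, 0, 100, 100]], dtype=np.float32)
preds = np.array([[0, 0, 70, 70]], dtype=np.float32)
iou_mat = box_iou(gts, preds)   # shape (1, 1)
print(iou_mat[0, 0])            # 0.49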
 
 
-def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
-    """Compute the IoA (intersection over area) between two sets of bounding boxes:
-    ioa(i, j) = inter(i, j) / area(i)
-
-    Args:
-        boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
-        boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
-    Returns:
-        the IoA matrix of shape (N, M)
-    """
-
-    ioa_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
-
-    if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
-        l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
-        l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
-
-        left = np.maximum(l1, l2.T)
-        top = np.maximum(t1, t2.T)
-        right = np.minimum(r1, r2.T)
-        bot = np.minimum(b1, b2.T)
-
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
-        area = (r1 - l1) * (b1 - t1)
-        ioa_mat = intersection / area
-
-    return ioa_mat
-
-
-def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
-    """Compute the IoU between two sets of boolean masks
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
+    """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
-        masks_1: boolean masks of shape (N, H, W)
-        masks_2: boolean masks of shape (M, H, W)
+    ----
+        polys_1: rotated bounding boxes of shape (N, 4, 2)
+        polys_2: rotated bounding boxes of shape (M, 4, 2)
 
     Returns:
+    -------
         the IoU matrix of shape (N, M)
     """
+    if polys_1.ndim != 3 or polys_2.ndim != 3:
+        raise AssertionError("expects boxes to be in format (N, 4, 2)")
 
-    if masks_1.shape[1:] != masks_2.shape[1:]:
-        raise AssertionError("both boolean masks should have the same spatial shape")
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
 
-    iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
 
-    if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
-        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
-        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
-        axes = tuple(range(2, masks_1.ndim + 1))
-        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
 
     return iou_mat
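A minimal check, assuming the polygon_iou function above is in scope: two axis-aligned squares expressed as 4-point polygons, the second covering a quarter of the first, yield an IoU of 0.25 (intersection 0.25, union 1.0):

import numpy as np

polys_1 = np.array([[[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]]])
polys_2 = np.array([[[0.5, 0.5], [1.0, 0.5], [1.0, 1.0], [0.5, 1.0]]])
iou_mat = polygon_iou(polys_1, polys_2)
print(iou_mat[0, 0])   # 0.25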
 
 
-def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Convert boxes to masks
-
-    Args:
-        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
-        shape: spatial shapes of the output masks
-
-    Returns:
-        the boolean masks of shape (N, H, W)
-    """
-
-    masks = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
-
-    if boxes.shape[0] > 0:
-        # Get absolute coordinates
-        if boxes.dtype != np.int:
-            abs_boxes = boxes.copy()
-            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
-            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
-            abs_boxes = abs_boxes.round().astype(np.int)
-        else:
-            abs_boxes = boxes
-            abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
-
-        # TODO: optimize slicing to improve vectorization
-        for idx, _box in enumerate(abs_boxes):
-            box = rbbox_to_polygon(_box)
-            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
-
-    return masks.astype(bool)
-
-
-def nms(boxes: np.ndarray, thresh: float = .5) -> List[int]:
+def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
     """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
 
     Args:
+    ----
         boxes: np array of straight boxes: (*, 5), (xmin, ymin, xmax, ymax, score)
         thresh: iou threshold to perform box suppression.
 
     Returns:
+    -------
         A list of box indexes to keep
     """
     x1 = boxes[:, 0]
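A short sketch of the expected suppression behaviour, assuming the nms function above is in scope (boxes are (xmin, ymin, xmax, ymax, score)):

import numpy as np

boxes = np.array([
    [0, 0, 10, 10, 0.9],     # highest score, kept
    [1, 1, 11, 11, 0.8],     # IoU with the first box is above 0.5, suppressed
    [50, 50, 60, 60, 0.7],   # no overlap with the kept box, kept
], dtype=np.float32)
print(nms(boxes, thresh=0.5))   # [0, 2]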
@@ -551,66 +535,71 @@ 

Source code for doctr.utils.metrics

 
 
 
-[docs] +[docs] class LocalizationConfusion: - """Implements common confusion metrics and mean IoU for localization evaluation. + r"""Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ - Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) + \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ + Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ + Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ + meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \\forall y \\in \\mathcal{B}, - g_X(y) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, + \forall y \in \mathcal{B}, + g_X(y) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: + """Updates the metric + Args: + ---- + gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + """ if preds.shape[0] > 0: # Compute IoU - if self.rotated_bbox: - mask_gts = rbox_to_mask(gts, shape=self.mask_shape) - mask_preds = rbox_to_mask(preds, shape=self.mask_shape) - iou_mat = mask_iou(mask_gts, mask_preds) + if self.use_polygons: + iou_mat = polygon_iou(gts, preds) else: iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=1).sum()) + self.tot_iou += float(iou_mat.max(axis=0).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -618,17 +607,18 @@

Source code for doctr.utils.metrics

 
         # Update counts
         self.num_gts += gts.shape[0]
-        self.num_preds += preds.shape[0]
+        self.num_preds += preds.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: """Computes the aggregated metrics - Returns: + Returns + ------- a tuple with the recall, precision and meanIoU scores """ - # Recall recall = self.matches / self.num_gts if self.num_gts > 0 else None @@ -636,7 +626,7 @@

Source code for doctr.utils.metrics

         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -645,64 +635,65 @@

Source code for doctr.utils.metrics

         self.num_gts = 0
         self.num_preds = 0
         self.matches = 0
-        self.tot_iou = 0.
+ self.tot_iou = 0.0
-[docs] +[docs] class OCRMetric: - """Implements end-to-end OCR metric. + r"""Implements an end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, - \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ - Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) + \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, + \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ + Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, - h_{B,L}(b, l) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ - & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, - :math:`\\mathcal{L}` is the set of possible character sequences, + \forall (b, l) \in \mathcal{B} \times \mathcal{L}, + h_{B,L}(b, l) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - ['hello'], ['hello', 'world']) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> ['hello'], ['hello', 'world']) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update( self, gt_boxes: np.ndarray, @@ -710,50 +701,58 @@

Source code for doctr.utils.metrics

         gt_labels: List[str],
         pred_labels: List[str],
     ) -> None:
+        """Updates the metric
 
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 4, 2) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 4, 2) if they are rotated ones
+            gt_labels: a list of N string labels
+            pred_labels: a list of M string labels
+        """
         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
-            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
-                                 "and the predictions")
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
 
         # Compute IoU
         if pred_boxes.shape[0] > 0:
-            if self.rotated_bbox:
-                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
-                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
-                iou_mat = mask_iou(mask_gts, mask_preds)
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
-            self.tot_iou += float(iou_mat.max(axis=1).sum())
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
 
             # Assign pairs
             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.unidecode_matches += int(_unidecode)
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
 
         self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]: """Computes the aggregated metrics - Returns: - a tuple with the recall & precision for each string comparison flexibility and the mean IoU + Returns + ------- + a tuple with the recall & precision for each string comparison and the mean IoU """ - # Recall recall = dict( raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None, caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None, - unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None, + anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None, unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None, ) @@ -761,12 +760,12 @@

Source code for doctr.utils.metrics

         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-            unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
             unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -774,12 +773,136 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.num_gts = 0
         self.num_preds = 0
-        self.tot_iou = 0.
+        self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.unidecode_matches = 0
+        self.anyascii_matches = 0
         self.unicase_matches = 0
+ + +
+[docs] +class DetectionMetric: + r"""Implements an object detection metric. + + The aggregated metrics are computed as follows: + + .. math:: + \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, + \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ + Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + + with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and + :math:`y`, and the function :math:`h_{B, C}` defined as: + + .. math:: + \forall (b, c) \in \mathcal{B} \times \mathcal{C}, + h_{B,C}(b, c) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{C}` is the set of possible class indices, + :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. + + >>> import numpy as np + >>> from doctr.utils import DetectionMetric + >>> metric = DetectionMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) + >>> metric.summary() + + Args: + ---- + iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format + """ + + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: + self.iou_thresh = iou_thresh + self.use_polygons = use_polygons + self.reset() + +
+[docs] + def update( + self, + gt_boxes: np.ndarray, + pred_boxes: np.ndarray, + gt_labels: np.ndarray, + pred_labels: np.ndarray, + ) -> None: + """Updates the metric + + Args: + ---- + gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + gt_labels: an array of class indices of shape (N,) + pred_labels: an array of class indices of shape (M,) + """ + if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: + raise AssertionError( + "there should be the same number of boxes and string both for the ground truth and the predictions" + ) + + # Compute IoU + if pred_boxes.shape[0] > 0: + if self.use_polygons: + iou_mat = polygon_iou(gt_boxes, pred_boxes) + else: + iou_mat = box_iou(gt_boxes, pred_boxes) + + self.tot_iou += float(iou_mat.max(axis=0).sum()) + + # Assign pairs + gt_indices, pred_indices = linear_sum_assignment(-iou_mat) + is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh + # Category comparison + self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) + + self.num_gts += gt_boxes.shape[0] + self.num_preds += pred_boxes.shape[0]
+ + +
+[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Computes the aggregated metrics + + Returns + ------- + a tuple with the recall & precision for each class prediction and the mean IoU + """ + # Recall + recall = self.num_matches / self.num_gts if self.num_gts > 0 else None + + # Precision + precision = self.num_matches / self.num_preds if self.num_preds > 0 else None + + # mean IoU (overall detected boxes) + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None + + return recall, precision, mean_iou
+ + + def reset(self) -> None: + self.num_gts = 0 + self.num_preds = 0 + self.tot_iou = 0.0 + self.num_matches = 0
+
@@ -812,8 +935,8 @@

Source code for doctr.utils.metrics

       
     
   
- - + + diff --git a/v0.4.0/_modules/doctr/utils/visualization.html b/v0.4.0/_modules/doctr/utils/visualization.html index 8e7dcca811..c818be6d7b 100644 --- a/v0.4.0/_modules/doctr/utils/visualization.html +++ b/v0.4.0/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.visualization

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import colorsys
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import matplotlib.pyplot as plt
-from matplotlib.figure import Figure
+import cv2
 import matplotlib.patches as patches
-import mplcursors
-from PIL import ImageFont, ImageDraw, Image
+import matplotlib.pyplot as plt
 import numpy as np
-import cv2
-from typing import Tuple, List, Dict, Any, Union
+from matplotlib.figure import Figure
 
-from .common_types import BoundingBox, RotatedBbox
+from .common_types import BoundingBox, Polygon4P
 
-__all__ = ['visualize_page', 'synthetize_page']
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
-def create_rect_patch(
-    geometry: Union[BoundingBox, RotatedBbox],
-    label: str,
+def rect_patch(
+    geometry: BoundingBox,
     page_dimensions: Tuple[int, int],
-    color: Tuple[int, int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
     alpha: float = 0.3,
     linewidth: int = 2,
     fill: bool = True,
-) -> patches.Patch:
-    """Create a matplotlib patch (rectangle) bounding the element
+    preserve_aspect_ratio: bool = False,
+) -> patches.Rectangle:
+    """Create a matplotlib rectangular patch for the element
 
     Args:
+    ----
         geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
         label: label to display when hovered
-        page_dimensions: dimensions of the Page
         color: color to draw box
         alpha: opacity parameter to fill the boxes, 0 = transparent
         linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
 
     Returns:
+    -------
         a rectangular Patch
     """
+    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
     height, width = page_dimensions
-    if len(geometry) == 5:
-        x, y, w, h, a = geometry  # type: ignore[misc]
-        x, w = x * width, w * width
-        y, h = y * height, h * height
-        points = cv2.boxPoints(((x, y), (w, h), a))
-        return patches.Polygon(
-            points,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
-    else:
-        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
-        xmin, xmax = xmin * width, xmax * width
-        ymin, ymax = ymin * height, ymax * height
-        return patches.Rectangle(
-            (xmin, ymin),
-            xmax - xmin,
-            ymax - ymin,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
+    (xmin, ymin), (xmax, ymax) = geometry
+    # Switch to absolute coords
+    if preserve_aspect_ratio:
+        width = height = max(height, width)
+    xmin, w = xmin * width, (xmax - xmin) * width
+    ymin, h = ymin * height, (ymax - ymin) * height
+
+    return patches.Rectangle(
+        (xmin, ymin),
+        w,
+        h,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def polygon_patch(
+    geometry: np.ndarray,
+    page_dimensions: Tuple[int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    alpha: float = 0.3,
+    linewidth: int = 2,
+    fill: bool = True,
+    preserve_aspect_ratio: bool = False,
+) -> patches.Polygon:
+    """Create a matplotlib polygon patch for the element
+
+    Args:
+    ----
+        geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
+        label: label to display when hovered
+        color: color to draw box
+        alpha: opacity parameter to fill the boxes, 0 = transparent
+        linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
+
+    Returns:
+    -------
+        a polygon Patch
+    """
+    if not geometry.shape == (4, 2):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
+    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
+    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
+
+    return patches.Polygon(
+        geometry,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def create_obj_patch(
+    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
+    page_dimensions: Tuple[int, int],
+    **kwargs: Any,
+) -> patches.Patch:
+    """Create a matplotlib patch for the element
+
+    Args:
+    ----
+        geometry: bounding box (straight or rotated) of the element
+        page_dimensions: dimensions of the page in format (height, width)
+        **kwargs: keyword arguments for the patch
+
+    Returns:
+    -------
+        a matplotlib Patch
+    """
+    if isinstance(geometry, tuple):
+        if len(geometry) == 2:  # straight word BB (2 pts)
+            return rect_patch(geometry, page_dimensions, **kwargs)
+        elif len(geometry) == 4:  # rotated word BB (4 pts)
+            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
+    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
+        return polygon_patch(geometry, page_dimensions, **kwargs)
+    raise ValueError("invalid geometry format")
+
+
+def get_colors(num_colors: int) -> List[Tuple[float, float, float]]:
+    """Generate num_colors color for matplotlib
+
+    Args:
+    ----
+        num_colors: number of colors to generate
+
+    Returns:
+    -------
+        colors: list of generated colors
+    """
+    colors = []
+    for i in np.arange(0.0, 360.0, 360.0 / num_colors):
+        hue = i / 360.0
+        lightness = (50 + np.random.rand() * 10) / 100.0
+        saturation = (90 + np.random.rand() * 10) / 100.0
+        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+    return colors
 
 
 
-[docs] +[docs] def visualize_page( page: Dict[str, Any], image: np.ndarray, @@ -359,18 +472,18 @@

Source code for doctr.utils.visualization

 ) -> Figure:
     """Visualize a full page with predicted blocks, lines and words
 
-    Example::
-        >>> import numpy as np
-        >>> import matplotlib.pyplot as plt
-        >>> from doctr.utils.visualization import visualize_page
-        >>> from doctr.models import ocr_db_crnn
-        >>> model = ocr_db_crnn(pretrained=True)
-        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-        >>> out = model([[input_page]])
-        >>> visualize_page(out[0].pages[0].export(), input_page)
-        >>> plt.show()
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from doctr.utils.visualization import visualize_page
+    >>> from doctr.models import ocr_db_crnn
+    >>> model = ocr_db_crnn(pretrained=True)
+    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+    >>> out = model([[input_page]])
+    >>> visualize_page(out[0].pages[0].export(), input_page)
+    >>> plt.show()
 
     Args:
+    ----
         page: the exported Page of a Document
        image: np array of the page, needs to have the same shape as page['dimensions']
         words_only: whether only words should be displayed
@@ -378,6 +491,11 @@ 

Source code for doctr.utils.visualization

         scale: figsize of the largest windows side
         interactive: whether the plot should be interactive
         add_labels: for static plot, adds text labels on top of bounding box
+        **kwargs: keyword arguments for the polygon patch
+
+    Returns:
+    -------
+        the matplotlib figure
     """
     # Get proper scale and aspect ratio
     h, w = image.shape[:2]
@@ -386,128 +504,189 @@ 

Source code for doctr.utils.visualization

     # Display the image
     ax.imshow(image)
     # hide both axis
-    ax.axis('off')
+    ax.axis("off")
 
     if interactive:
         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
 
-    for block in page['blocks']:
+    for block in page["blocks"]:
         if not words_only:
-            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
+            rect = create_obj_patch(
+                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
+            )
             # add patch on figure
             ax.add_patch(rect)
             if interactive:
                 # add patch to cursor's artists
                 artists.append(rect)
 
-        for line in block['lines']:
+        for line in block["lines"]:
             if not words_only:
-                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
+                rect = create_obj_patch(
+                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
-            for word in line['words']:
-                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
-                                         page['dimensions'], (0, 0, 1), **kwargs)
+            for word in line["words"]:
+                rect = create_obj_patch(
+                    word["geometry"],
+                    page["dimensions"],
+                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
+                    color=(0, 0, 1),
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
                 elif add_labels:
-                    if len(word['geometry']) == 5:
+                    if len(word["geometry"]) == 5:
                         text_loc = (
-                            int(page['dimensions'][1] * (word['geometry'][0] - word['geometry'][2] / 2)),
-                            int(page['dimensions'][0] * (word['geometry'][1] - word['geometry'][3] / 2))
+                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
+                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
                         )
                     else:
                         text_loc = (
-                            int(page['dimensions'][1] * word['geometry'][0][0]),
-                            int(page['dimensions'][0] * word['geometry'][0][1])
+                            int(page["dimensions"][1] * word["geometry"][0][0]),
+                            int(page["dimensions"][0] * word["geometry"][0][1]),
+                        )
+
+                    if len(word["geometry"]) == 2:
+                        # We draw only if boxes are in straight format
+                        ax.text(
+                            *text_loc,
+                            word["value"],
+                            size=10,
+                            alpha=0.5,
+                            color=(0, 0, 1),
                         )
-                    ax.text(
-                        *text_loc,
-                        word['value'],
-                        size=10,
-                        alpha=0.5,
-                        color=(0, 0, 1),
-                    )
 
         if display_artefacts:
-            for artefact in block['artefacts']:
-                rect = create_rect_patch(
-                    artefact['geometry'],
-                    'artefact',
-                    page['dimensions'],
-                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
+            for artefact in block["artefacts"]:
+                rect = create_obj_patch(
+                    artefact["geometry"],
+                    page["dimensions"],
+                    label="artefact",
+                    color=(0.5, 0.5, 0.5),
                     linewidth=1,
-                    **kwargs
+                    **kwargs,
                 )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
-    fig.tight_layout(pad=0.)
+    fig.tight_layout(pad=0.0)
 
     return fig
-def synthetize_page( +def visualize_kie_page( page: Dict[str, Any], - draw_proba: bool = False, - font_size: int = 13, -) -> np.ndarray: - """Draw a the content of the element page (OCR response) on a blank page. + image: np.ndarray, + words_only: bool = False, + display_artefacts: bool = True, + scale: float = 10, + interactive: bool = True, + add_labels: bool = True, + **kwargs: Any, +) -> Figure: + """Visualize a full page with predicted blocks, lines and words + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from doctr.utils.visualization import visualize_page + >>> from doctr.models import ocr_db_crnn + >>> model = ocr_db_crnn(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([[input_page]]) + >>> visualize_kie_page(out[0].pages[0].export(), input_page) + >>> plt.show() Args: - page: exported Page object to represent - draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0 - font_size: size of the font, default font = 13 + ---- + page: the exported Page of a Document + image: np array of the page, needs to have the same shape than page['dimensions'] + words_only: whether only words should be displayed + display_artefacts: whether artefacts should be displayed + scale: figsize of the largest windows side + interactive: whether the plot should be interactive + add_labels: for static plot, adds text labels on top of bounding box + **kwargs: keyword arguments for the polygon patch - Return: - A np array (drawn page) + Returns: + ------- + the matplotlib figure """ - # Draw template - h, w = page["dimensions"] - response = 255 * np.ones((h, w, 3), dtype=np.int32) + # Get proper scale and aspect ratio + h, w = image.shape[:2] + size = (scale * w / h, scale) if h > w else (scale, h / w * scale) + fig, ax = plt.subplots(figsize=size) + # Display the image + ax.imshow(image) + # hide both axis + ax.axis("off") - # Draw each word - for block in page["blocks"]: - for line in block["lines"]: - for word in line["words"]: - # Get aboslute word geometry - (xmin, ymin), (xmax, ymax) = word["geometry"] - xmin, xmax = int(w * xmin), int(w * xmax) - ymin, ymax = int(h * ymin), int(h * ymax) - - # White drawing context adapted to font size, 0.75 factor to convert pts --> pix - h_box, w_box = ymax - ymin, xmax - xmin - h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75)) - img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255)) - d = ImageDraw.Draw(img) - - # Draw in black the value of the word - d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0)) - - # Resize back to box size - img = img.resize((w_box, h_box), Image.NEAREST) - - # Colorize if draw_proba - if draw_proba: - p = int(255 * word["confidence"]) - mask = np.where(np.array(img) == 0, 1, 0) - proba = np.array([255 - p, 0, p]) - color = mask * proba[np.newaxis, np.newaxis, :] - white_mask = 255 * (1 - mask) - img = color + white_mask - - # Write to response page - response[ymin:ymax, xmin:xmax, :] = np.array(img) - - return response + if interactive: + artists: List[patches.Patch] = [] # instantiate an empty list of patches (to be drawn on the page) + + colors = {k: color for color, k in zip(get_colors(len(page["predictions"])), page["predictions"])} + for key, value in page["predictions"].items(): + for prediction in value: + if not words_only: + rect = create_obj_patch( + prediction["geometry"], + page["dimensions"], + label=f"{key} \n {prediction['value']} (confidence: 
{prediction['confidence']:.2%}", + color=colors[key], + linewidth=1, + **kwargs, + ) + # add patch on figure + ax.add_patch(rect) + if interactive: + # add patch to cursor's artists + artists.append(rect) + + if interactive: + import mplcursors + + # Create mlp Cursor to hover patches in artists + mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label())) + fig.tight_layout(pad=0.0) + + return fig + + +def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None: + """Draw an array of relative straight boxes on an image + + Args: + ---- + boxes: array of relative boxes, of shape (*, 4) + image: np array, float32 or uint8 + color: color to use for bounding box edges + **kwargs: keyword arguments from `matplotlib.pyplot.plot` + """ + h, w = image.shape[:2] + # Convert boxes to absolute coords + _boxes = deepcopy(boxes) + _boxes[:, [0, 2]] *= w + _boxes[:, [1, 3]] *= h + _boxes = _boxes.astype(np.int32) + for box in _boxes.tolist(): + xmin, ymin, xmax, ymax = box + image = cv2.rectangle( + image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2 + ) + plt.imshow(image) + plt.plot(**kwargs)
@@ -540,8 +719,8 @@

Source code for doctr.utils.visualization

       
     
   
- - + + diff --git a/v0.4.0/_modules/index.html b/v0.4.0/_modules/index.html index e86abcd4d4..5793c44f20 100644 --- a/v0.4.0/_modules/index.html +++ b/v0.4.0/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -225,20 +225,42 @@ - - + + diff --git a/v0.4.0/_sources/changelog.rst.txt b/v0.4.0/_sources/changelog.rst.txt index 430097d6c8..35befe7b96 100644 --- a/v0.4.0/_sources/changelog.rst.txt +++ b/v0.4.0/_sources/changelog.rst.txt @@ -1,6 +1,54 @@ Changelog ========= +v0.10.0 (2024-10-21) +------------------- +Release note: `v0.10.0 `_ + +v0.9.0 (2024-08-08) +------------------- +Release note: `v0.9.0 `_ + +v0.8.1 (2024-03-04) +------------------- +Release note: `v0.8.1 `_ + +v0.8.0 (2024-02-28) +------------------- +Release note: `v0.8.0 `_ + +v0.7.0 (2023-09-09) +------------------- +Release note: `v0.7.0 `_ + +v0.6.0 (2022-09-29) +------------------- +Release note: `v0.6.0 `_ + +v0.5.1 (2022-03-22) +------------------- +Release note: `v0.5.1 `_ + +v0.5.0 (2021-12-31) +------------------- +Release note: `v0.5.0 `_ + +v0.4.1 (2021-11-22) +------------------- +Release note: `v0.4.1 `_ + +v0.4.0 (2021-10-01) +------------------- +Release note: `v0.4.0 `_ + +v0.3.1 (2021-08-27) +------------------- +Release note: `v0.3.1 `_ + +v0.3.0 (2021-07-02) +------------------- +Release note: `v0.3.0 `_ + v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.4.0/_sources/datasets.rst.txt b/v0.4.0/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.4.0/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.4.0/_sources/documents.rst.txt b/v0.4.0/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.4.0/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. 
currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.4.0/_sources/getting_started/installing.rst.txt b/v0.4.0/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.4.0/_sources/getting_started/installing.rst.txt +++ b/v0.4.0/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.4.0/_sources/index.rst.txt b/v0.4.0/_sources/index.rst.txt index fc3ff89fdf..53251db142 100644 --- a/v0.4.0/_sources/index.rst.txt +++ b/v0.4.0/_sources/index.rst.txt @@ -1,7 +1,8 @@ -DocTR: Document Text Recognition -================================ +******************************** +docTR: Document Text Recognition +******************************** -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -9,38 +10,29 @@ State-of-the-art Optical Character Recognition made seamless & accessible to any DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. +* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. -Welcome to the documentation of `DocTR `_! 
- - Main Features ------------- * |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract +* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, small dependencies -* |:tools:| Daily maintained -* |:factory:| Easy integration - +* |:bird:| Light package, minimal dependencies +* |:tools:| Actively maintained by Mindee +* |:factory:| Easy integration (available templates for browser demo & API deployment) -Getting Started ---------------- .. toctree:: :maxdepth: 2 + :caption: Getting started + :hidden: - installing - - -Build & train your predictor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) -* Fine-tune or train from scratch any detection or recognition model to specialize on your data + getting_started/installing + notebooks Model zoo @@ -48,36 +40,83 @@ Model zoo Text detection models """"""""""""""""""""" - * `DBNet `_ (Differentiable Binarization) - * `LinkNet `_ +* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ +* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ +* FAST from `"FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation" `_ Text recognition models """"""""""""""""""""""" - * `SAR `_ (Show, Attend and Read) - * `CRNN `_ (Convolutional Recurrent Neural Network) - * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) +* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ +* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ +* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ +* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ +* PARSeq from `"Scene Text Recognition with Permuted Autoregressive Sequence Models" `_ Supported datasets ^^^^^^^^^^^^^^^^^^ - * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. - * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. - * SROIE from `ICDAR 2019 `_. +* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. +* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +* SROIE from `ICDAR 2019 `_. +* IIIT-5k from `CVIT `_. +* Street View Text from `"End-to-End Scene Text Recognition" `_. +* SynthText from `Visual Geometry Group `_. +* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. +* IC03 from `ICDAR 2003 `_. +* IC13 from `ICDAR 2013 `_. +* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. +* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. +* IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. 
toctree:: :maxdepth: 2 - :caption: Notes + :caption: Using docTR + :hidden: - changelog + using_doctr/using_models + using_doctr/using_datasets + using_doctr/using_contrib_modules + using_doctr/sharing_models + using_doctr/using_model_export + using_doctr/custom_models_training + using_doctr/running_on_aws + + +.. toctree:: + :maxdepth: 2 + :caption: Community + :hidden: + + community/resources .. toctree:: :maxdepth: 2 :caption: Package Reference + :hidden: - datasets - documents - models - transforms - utils + modules/contrib + modules/datasets + modules/io + modules/models + modules/transforms + modules/utils + + +.. toctree:: + :maxdepth: 2 + :caption: Contributing + :hidden: + + contributing/code_of_conduct + contributing/contributing + + +.. toctree:: + :maxdepth: 2 + :caption: Notes + :hidden: + + changelog diff --git a/v0.4.0/_sources/installing.rst.txt b/v0.4.0/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.4.0/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.4.0/_sources/models.rst.txt b/v0.4.0/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.4.0/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. 
- - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. 
-We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.4.0/_sources/transforms.rst.txt b/v0.4.0/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.4.0/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. 
autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.4.0/_sources/utils.rst.txt b/v0.4.0/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.4.0/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.4.0/_static/basic.css b/v0.4.0/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.4.0/_static/basic.css +++ b/v0.4.0/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.4.0/_static/doctools.js b/v0.4.0/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.4.0/_static/doctools.js +++ b/v0.4.0/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.4.0/_static/documentation_options.js b/v0.4.0/_static/documentation_options.js index a7b5cbe04a..4f656fdbea 100644 --- a/v0.4.0/_static/documentation_options.js +++ b/v0.4.0/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.3.0a0-git', + VERSION: '0.10.1a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.4.0/_static/language_data.js b/v0.4.0/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.4.0/_static/language_data.js +++ b/v0.4.0/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.4.0/_static/searchtools.js b/v0.4.0/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.4.0/_static/searchtools.js +++ b/v0.4.0/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.4.0/changelog.html b/v0.4.0/changelog.html index eafac3a877..fc45a50384 100644 --- a/v0.4.0/changelog.html +++ b/v0.4.0/changelog.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + Changelog - docTR documentation @@ -226,20 +226,42 @@ + diff --git a/v0.4.0/community/resources.html b/v0.4.0/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.4.0/community/resources.html +++ b/v0.4.0/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.4.0/contributing/code_of_conduct.html b/v0.4.0/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.4.0/contributing/code_of_conduct.html +++ b/v0.4.0/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.4.0/contributing/contributing.html b/v0.4.0/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.4.0/contributing/contributing.html +++ b/v0.4.0/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.4.0/datasets.html b/v0.4.0/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.4.0/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework -can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before passing it to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
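To make the loader's role concrete, here is a minimal training-loop sketch (an illustration only, not part of the original page); it assumes the FUNSD dataset has already been downloaded and that the loader can be iterated over directly, as its iterator-based example above suggests.
>>> from doctr.datasets import FUNSD, DataLoader
>>> train_set = FUNSD(train=True, download=True)
>>> train_loader = DataLoader(train_set, shuffle=True, batch_size=32, drop_last=False)
>>> for images, targets in train_loader:  # one pass over the dataset, batch by batch
...     pass  # a forward pass and loss computation would go here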
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets -of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name

size

characters

digits

10

0123456789

ascii_letters

52

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

punctuation

32

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

currency

5

£€¥¢฿

latin

96

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

french

154

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as a mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
- -
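As a minimal illustration of the function above (not part of the original page), the snippet below encodes two strings with the "digits" vocab from the table; the target size is an arbitrary example value.
>>> from doctr.datasets import encode_sequences
>>> vocab = "0123456789"  # the "digits" vocab listed in the table above
>>> encoded = encode_sequences(sequences=["2021", "42"], vocab=vocab, target_size=8)
>>> # encoded is a (2, 8) integer array: one row of character indices per string, padded with the EOS code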
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/documents.html b/v0.4.0/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.4.0/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
-
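To make the hierarchy above concrete, here is a minimal traversal sketch (an illustration only); doc is assumed to be a Document instance, e.g. produced by an end-to-end OCR predictor, and the attribute names follow the constructors documented above.
>>> for page in doc.pages:
...     for block in page.blocks:
...         for line in block.lines:
...             for word in line.words:
...                 print(word.value, word.confidence, word.geometry)  # relative ((xmin, ymin), (xmax, ymax))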

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF file returned as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/genindex.html b/v0.4.0/genindex.html index a19b433943..21520455b4 100644 --- a/v0.4.0/genindex.html +++ b/v0.4.0/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -224,20 +224,42 @@

+
+

U

+ + +
+
+

V

@@ -561,7 +711,13 @@

V

W

+
@@ -599,8 +755,8 @@

W

- - + + diff --git a/v0.4.0/getting_started/installing.html b/v0.4.0/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.4.0/getting_started/installing.html +++ b/v0.4.0/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.4.0/index.html b/v0.4.0/index.html index 4c6a28c66a..3a06afc6d9 100644 --- a/v0.4.0/index.html +++ b/v0.4.0/index.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + docTR documentation @@ -226,20 +226,42 @@
-

DocTR: Document Text Recognition

-

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta)

+

docTR: Document Text Recognition

+

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

DocTR provides an easy and powerful way to extract valuable information from your documents:

    -
  • 🧾 for automation: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • +
  • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • 👩‍🔬 for research: quickly compare your own architectures speed & performances with state-of-art models on public datasets.

-

Welcome to the documentation of DocTR!

Main Features

  • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

  • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor

  • -
  • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

  • +
  • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

  • ⚡ Optimized for inference speed on both CPU & GPU

  • -
  • 🐦 Light package, small dependencies

  • -
  • 🛠️ Daily maintained

  • -
  • 🏭 Easy integration

  • +
  • 🐦 Light package, minimal dependencies

  • +
  • 🛠️ Actively maintained by Mindee

  • +
  • 🏭 Easy integration (available templates for browser demo & API deployment)

-
-
-

Getting Started

-
-

Build & train your predictor

-
    -
  • Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

  • -
  • Fine-tune or train from scratch any detection or recognition model to specialize on your data

  • -
-

Model zoo

Text detection models

-
-

Text recognition models

-
-

Supported datasets

-
-
+
+
+
+
+
@@ -406,7 +381,7 @@

Supported datasets - +
Next @@ -446,10 +421,8 @@

Supported datasets + diff --git a/v0.4.0/installing.html b/v0.4.0/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.4.0/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running another OS than Linux, you will need a few extra dependencies.

-

For MacOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the last stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/models.html b/v0.4.0/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.4.0/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with a TensorFlow backend, along with its specific post-processor to make outputs structured and reusable (a minimal end-to-end sketch follows this list).

  • -
-
-
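As a minimal end-to-end sketch of how these two components are wrapped for the user (an illustration only; the ocr_predictor helper is documented in the End-to-End OCR section below):
>>> from doctr.documents import DocumentFile
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)  # detection + recognition predictors with their pre/post-processing
>>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
>>> result = model(pages)  # structured output rather than raw tensors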

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Input shape

# params

Recall

Precision

Recall

Precision

FPS

db_resnet50

(1024, 1024, 3)

25.2 M

82.14

87.64

92.49

89.66

2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

-

FPS (frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large AWS instance (Xeon Platinum 8275L CPU) to perform the experiments.

-
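For reference, this protocol can be reproduced with a simple timing loop such as the sketch below (an illustration only, not the exact benchmarking code used for the table):
>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> dummy = lambda: tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> for _ in range(100):  # warm-up runs
...     _ = model(dummy(), training=False)
>>> start = time.time()
>>> for _ in range(1000):  # timed runs, batches of 1 frame
...     _ = model(dummy(), training=False)
>>> fps = 1000 / (time.time() - start)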
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following (an illustrative sketch is given after the list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
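The sketch below illustrates these three steps with plain TensorFlow operations (an approximation for illustration only; the mean/std values are placeholders, whereas the actual PreProcessor uses the training data statistics):
>>> import tensorflow as tf
>>> images = [tf.random.uniform(shape=[896, 640, 3], maxval=1, dtype=tf.float32) for _ in range(2)]  # dummy pages
>>> # 1. resize to the target size (bilinear by default), with potential deformation
>>> resized = [tf.image.resize(img, [1024, 1024], method="bilinear") for img in images]
>>> # 2. batch images together
>>> batch = tf.stack(resized, axis=0)
>>> # 3. normalize the batch (placeholder statistics)
>>> batch = (batch - tf.constant([0.5, 0.5, 0.5])) / tf.constant([1.0, 1.0, 1.0])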
-

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Text recognition model zoo

Architecture

Input shape

# params

FUNSD

CORD

FPS

crnn_vgg16_bn

(32, 128, 3)

15.8M

86.02

91.3

12.8

sar_vgg16_bn

(32, 128, 3)

21.5M

86.2

91.7

3.3

sar_resnet31

(32, 128, 3)

53.1M

86.3

92.1

2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our French vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model capabilities

-

FPS (frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large AWS instance (Xeon Platinum 8275L CPU) to perform the experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
-
-

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
-

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

Example::
>>> import tensorflow as tf
>>> from doctr.models import crnn_vgg16_bn
>>> model = crnn_vgg16_bn(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)
Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture

doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) → SAR

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example::
>>> import tensorflow as tf
>>> from doctr.models import sar_vgg16_bn
>>> model = sar_vgg16_bn(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)
Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture

doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) → SAR

SAR with a ResNet-31 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example::
>>> import tensorflow as tf
>>> from doctr.models import sar_resnet31
>>> model = sar_resnet31(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)
Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture

doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) → MASTER

MASTER as described in the paper: https://arxiv.org/pdf/1910.02562.pdf

Example::
>>> import tensorflow as tf
>>> from doctr.models import master
>>> model = master(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)
Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture


Recognition predictors


Combining the right components around a given architecture for easier usage.

doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → RecognitionPredictor

Text recognition architecture.

Example::
>>> import numpy as np
>>> from doctr.models import recognition_predictor
>>> model = recognition_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
>>> out = model([input_page])
Parameters:

  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • pretrained – If True, returns a model pre-trained on our text recognition dataset

Returns:

Recognition predictor


End-to-End OCR


Predictors that localize and identify text elements in images

                                     FUNSD                        CORD
Architecture                    Recall   Precision   FPS     Recall   Precision   FPS
db_resnet50 + crnn_vgg16_bn     70.08    74.77       0.85    82.19    79.67       1.6
db_resnet50 + sar_vgg16_bn      N/A      N/A         0.49    N/A      N/A         1.0
db_resnet50 + sar_resnet31      N/A      N/A         0.27    N/A      N/A         0.83
Gvision text detection          59.50    62.50               75.30    70.00
Gvision doc. text detection     64.00    53.30               68.90    61.10
AWS textract                    78.10    83.00               87.50    66.00

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.


All the recognition models used in these predictors are trained with our French vocab (cf. Supported Vocabs).


Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities.


FPS (frames per second) is computed this way: we instantiate the predictor, warm up the model, and then measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.


Results on private OCR datasets

                                      Receipts               Invoices               IDs
Architecture                         Recall   Precision     Recall   Precision     Recall   Precision
db_resnet50 + crnn_vgg16_bn (ours)   78.90    81.01         65.68    69.86         49.48    50.46
Gvision doc. text detection          68.91    59.89         63.20    52.85         43.70    29.21
AWS textract                         75.77    77.70         70.47    69.13         46.39    43.32

Two-stage approaches


These architectures involve one stage of text detection and one stage of text recognition: the text detection output is used to produce cropped images that are then passed to the text recognition block (a rough sketch of this flow follows below).
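
As an illustration only, here is a sketch that composes the detection and recognition predictors documented on this page by hand. It assumes the detection predictor returns, for each page, an array of relative (xmin, ymin, xmax, ymax, score) boxes; this format may differ across versions, so check the actual output of your installation:

>>> import numpy as np
>>> from doctr.models import detection_predictor, recognition_predictor
>>> det_model = detection_predictor(pretrained=True)
>>> reco_model = recognition_predictor(pretrained=True)
>>> page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> boxes = det_model([page])[0]  # assumed: relative (xmin, ymin, xmax, ymax, score) rows
>>> h, w = page.shape[:2]
>>> crops = [page[int(ymin * h):int(ymax * h), int(xmin * w):int(xmax * w)]
...          for xmin, ymin, xmax, ymax, _ in boxes]
>>> words = reco_model(crops)  # one recognized string per crop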

doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → OCRPredictor

End-to-end OCR architecture using one model for localization, and another for text recognition.

Example::
>>> import numpy as np
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> out = model([input_page])
Parameters:

  • det_arch – name of the text detection architecture to use (e.g. ‘db_resnet50’)

  • reco_arch – name of the text recognition architecture to use (e.g. ‘crnn_vgg16_bn’)

  • pretrained – If True, returns a model pre-trained on our OCR dataset

Returns:

OCR predictor


Model export


Utility functions to make the most of document analysis models.


Model compression

doctr.models.export.convert_to_tflite(tf_model: Model) → bytes

Converts a model to TFLite format

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_tflite, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_tflite(model)
Parameters:

tf_model – a Keras model

Returns:

the serialized TFLite model

Return type:

bytes

doctr.models.export.convert_to_fp16(tf_model: Model) → bytes

Converts a model to half precision

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_fp16, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_fp16(model)
Parameters:

tf_model – a Keras model

Returns:

the serialized FP16 model

Return type:

bytes

doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) → bytes

Quantize a TensorFlow model

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import quantize_model, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = quantize_model(model, (224, 224, 3))
Parameters:

  • tf_model – a Keras model

  • input_shape – shape of the expected input tensor (excluding batch dimension) in channels-last order

Returns:

the serialized quantized model

Return type:

bytes
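
Each of the three utilities above returns raw bytes. A common follow-up (plain TensorFlow Lite usage, not a docTR-specific API) is to write those bytes to disk or load them directly into a TFLite interpreter, reusing the serialized_model variable from the examples above:

>>> import tensorflow as tf
>>> with open('model.tflite', 'wb') as f:  # persist the serialized model
...     f.write(serialized_model)
>>> interpreter = tf.lite.Interpreter(model_content=serialized_model)  # or run it directly
>>> interpreter.allocate_tensors()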


Using SavedModel

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> _ = model(input_t, training=False)
>>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')

And loaded just as easily:

>>> import tensorflow as tf
>>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
"Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, "end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], "Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train 
your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], 
"channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], 
"line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation 
(class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", 
false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", 
"Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], 
"51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, 
"b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], 
"db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 
18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], 
"json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 
19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 
15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 
3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 
18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, 
"coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.4.0/transforms.html b/v0.4.0/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.4.0/transforms.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- -
- -
- -
-
-
-

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.
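For instance, a minimal sketch of chaining two of the transformations documented below (assuming the TensorFlow backend used throughout these examples, and input images as float tensors in [0, 1]):
>>> from doctr.transforms import Compose, Resize, Normalize
>>> import tensorflow as tf
>>> transfo = Compose([Resize((32, 32)), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))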

-
-

Supported transformations

-

Here are all transformations that are available through DocTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
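For instance, a hedged variant of the example above exercising the padding options documented here (same TensorFlow setup as above; the input shape is arbitrary):
>>> transfo = Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
>>> out = transfo(tf.random.uniform(shape=[48, 64, 3], minval=0, maxval=1))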
- -
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor to a Gaussian distribution for each channel

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a user-defined transformation (callable) to the input tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction for a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.

-
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomGamma, RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, one only will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Apply the input transformation with a probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.0/using_doctr/custom_models_training.html b/v0.4.0/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.4.0/using_doctr/custom_models_training.html +++ b/v0.4.0/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.4.0/using_doctr/running_on_aws.html b/v0.4.0/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.4.0/using_doctr/running_on_aws.html +++ b/v0.4.0/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.4.0/using_doctr/sharing_models.html b/v0.4.0/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.4.0/using_doctr/sharing_models.html +++ b/v0.4.0/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.4.0/using_doctr/using_contrib_modules.html b/v0.4.0/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.4.0/using_doctr/using_contrib_modules.html +++ b/v0.4.0/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.4.0/using_doctr/using_datasets.html b/v0.4.0/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.4.0/using_doctr/using_datasets.html +++ b/v0.4.0/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.4.0/using_doctr/using_model_export.html b/v0.4.0/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.4.0/using_doctr/using_model_export.html +++ b/v0.4.0/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.4.0/using_doctr/using_models.html b/v0.4.0/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.4.0/using_doctr/using_models.html +++ b/v0.4.0/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.4.0/utils.html b/v0.4.0/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.4.0/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- -
- -
- -
-
-
-

doctr.utils

-

This module gathers non-core features that complement the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest windows side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model's performance.
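All the metrics below follow the same three-step pattern: instantiate, accumulate one or more batches with update(), then read the aggregated scores with summary(). A minimal sketch (assuming, as in the examples below, that ground-truth values are passed to update() before predictions):
>>> from doctr.utils import TextMatch
>>> metric = TextMatch()
>>> for gt_words, pred_words in [(['Hello'], ['Hello']), (['world'], ['word'])]:
...     metric.update(gt_words, pred_words)
>>> metric.summary()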

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements text match metric (word-level accuracy) for recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, f_a(x) = \left\{ \begin{array}{ll} 1 & \mbox{if } x = a \\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, \(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
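With the inputs above, and assuming the summary keys follow the description of summary() below, the raw (exact) score would be 0.5 (only 'world' matches verbatim), while the lower-case score would be 1.0, since 'Hello' and 'hello' agree once case is ignored.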
-
-
-
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode counterpart and its lower-case unidecode counterpart

-
-
-
- -
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and \(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, g_X(y) = \left\{ \begin{array}{ll} 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, \(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
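In this example, assuming the first argument of update() holds the ground-truth boxes and the second the predictions, the best overlap is IoU = 0.49 (an intersection of 4900 over a union of 10000), which falls below the 0.5 threshold: recall and precision are therefore 0, and the mean IoU is roughly 0.245.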
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float | None, float | None, float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores

-
-
-
- -
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and \(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, h_{B,L}(b, l) = \left\{ \begin{array}{ll} 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, \(\mathcal{L}\) is the set of possible character sequences, \(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
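As in the localization example above, the best IoU here is 0.49, below the 0.5 threshold, so no box is matched whatever the transcription: recall and precision are 0 for every string comparison flexibility, and the mean IoU is roughly 0.245 (a worked illustration, assuming ground-truth values are passed to update() before predictions).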
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall & precision for each string comparison flexibility and the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.4.1/_modules/doctr/datasets/cord.html b/v0.4.1/_modules/doctr/datasets/cord.html index f98ee6901c..55b0584830 100644 --- a/v0.4.1/_modules/doctr/datasets/cord.html +++ b/v0.4.1/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.cord

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
-from doctr.utils.geometry import fit_rbbox
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['CORD']
+__all__ = ["CORD"]
 
 
 
-[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - Example:: - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 + :align: center + + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', - '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', - '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", + "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", + "cord_train.zip", + ) + + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", + "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", + "cord_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - - # # List images - self.root = os.path.join(self._root, 'image') - self.data: List[Tuple[str, Dict[str, Any]]] = [] + # List images + tmp_root = os.path.join(self.root, "image") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] self.train = train - self.sample_transforms = sample_transforms - for img_path in os.listdir(self.root): + np_dtype = np.float32 + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem _targets = [] - with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: if len(word["text"]) > 0: x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - if rotated_bbox: - box = list(fit_rbbox(np.array([ - [x[0], y[0]], - [x[1], y[1]], - [x[2], y[2]], - [x[3], y[3]], - ], dtype=np.float32))) + box: Union[List[float], np.ndarray] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], + dtype=np_dtype, + ) else: - # Reduce 8 coords to 4 + # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax box = [min(x), min(y), max(x), max(y)] - _targets.append((word['text'], box)) + _targets.append((word["text"], box)) text_targets, box_targets = zip(*_targets) - self.data.append(( - img_path, - dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets) - )) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
@@ -397,8 +461,8 @@

Source code for doctr.datasets.cord

       
     
   
-
- + + diff --git a/v0.4.1/_modules/doctr/datasets/core.html b/v0.4.1/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.4.1/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
-
- -
- -
-
-

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/_modules/doctr/datasets/datasets/tensorflow.html b/v0.4.1/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.4.1/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
-
- -
- -
-
-

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset, _VisionDataset): - pass
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/_modules/doctr/datasets/detection.html b/v0.4.1/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.4.1/_modules/doctr/datasets/detection.html +++ b/v0.4.1/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/doc_artefacts.html b/v0.4.1/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.4.1/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.4.1/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.4.1/_modules/doctr/datasets/funsd.html b/v0.4.1/_modules/doctr/datasets/funsd.html index 35d7ad4cf5..f08612f9fa 100644 --- a/v0.4.1/_modules/doctr/datasets/funsd.html +++ b/v0.4.1/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.funsd

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['FUNSD']
+__all__ = ["FUNSD"]
 
 
 
-[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - Example:: - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 + :align: center + + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' - SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' - FILE_NAME = 'funsd.zip' + URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" + SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" + FILE_NAME = "funsd.zip" def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + super().__init__( + self.URL, + self.FILE_NAME, + self.SHA256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - self.sample_transforms = sample_transforms + np_dtype = np.float32 # Use the subset - subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') + subfolder = os.path.join("dataset", "training_data" if train else "testing_data") # # List images - self.root = os.path.join(self._root, subfolder, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + tmp_root = os.path.join(self.root, subfolder, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: data = json.load(f) - _targets = [(word['text'], word['box']) for block in data['form'] - for word in block['words'] if len(word['text']) > 0] + _targets = [ + (word["text"], word["box"]) + for block in data["form"] + for word in block["words"] + if len(word["text"]) > 0 + ] text_targets, box_targets = zip(*_targets) - if rotated_bbox: - # box_targets: xmin, ymin, xmax, ymax -> x, y, w, h, alpha = 0 - box_targets = [ + if use_polygons: + # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + box_targets = [ # type: ignore[assignment] [ - (box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0 - ] for box in box_targets + [box[0], box[1]], + [box[2], box[1]], + [box[2], box[3]], + [box[0], box[3]], + ] + for box in box_targets ] - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets))) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) + ) + for crop, label in zip(crops, list(text_targets)): + # filter labels with unknown characters + if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
@@ -386,8 +453,8 @@

Source code for doctr.datasets.funsd

       
     
   
-
- + + diff --git a/v0.4.1/_modules/doctr/datasets/generator/tensorflow.html b/v0.4.1/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.4.1/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.4.1/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.4.1/_modules/doctr/datasets/ic03.html b/v0.4.1/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.4.1/_modules/doctr/datasets/ic03.html +++ b/v0.4.1/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/ic13.html b/v0.4.1/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.4.1/_modules/doctr/datasets/ic13.html +++ b/v0.4.1/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/iiit5k.html b/v0.4.1/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.4.1/_modules/doctr/datasets/iiit5k.html +++ b/v0.4.1/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/iiithws.html b/v0.4.1/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.4.1/_modules/doctr/datasets/iiithws.html +++ b/v0.4.1/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/imgur5k.html b/v0.4.1/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.4.1/_modules/doctr/datasets/imgur5k.html +++ b/v0.4.1/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/loader.html b/v0.4.1/_modules/doctr/datasets/loader.html index d32e6da298..ed80350ef0 100644 --- a/v0.4.1/_modules/doctr/datasets/loader.html +++ b/v0.4.1/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.loader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-import tensorflow as tf
-import numpy as np
-from typing import Optional
+from typing import Callable, Optional
 
-from .multithreading import multithread_exec
+import numpy as np
+import tensorflow as tf
 
 __all__ = ["DataLoader"]
 
@@ -293,12 +314,13 @@ 

Source code for doctr.datasets.loader

     """Collate multiple elements into batches
 
     Args:
+    ----
         samples: list of N tuples containing M elements
 
     Returns:
+    -------
         Tuple of M sequences contianing N elements each
     """
-
     batch_data = zip(*samples)
 
     tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
@@ -307,23 +329,23 @@ 

Source code for doctr.datasets.loader

 
 
 
-[docs] +[docs] class DataLoader: """Implements a dataset wrapper for fast data loading - Example:: - >>> from doctr.datasets import FUNSD, DataLoader - >>> train_set = CORD(train=True, download=True) - >>> train_loader = DataLoader(train_set, batch_size=32) - >>> train_iter = iter(train_loader) - >>> images, targets = next(train_iter) + >>> from doctr.datasets import CORD, DataLoader + >>> train_set = CORD(train=True, download=True) + >>> train_loader = DataLoader(train_set, batch_size=32) + >>> train_iter = iter(train_loader) + >>> images, targets = next(train_iter) Args: + ---- dataset: the dataset shuffle: whether the samples should be shuffled before passing it to the iterator batch_size: number of elements in each batch drop_last: if `True`, drops the last batch if it isn't full - workers: number of workers to use for data loading + collate_fn: function to merge samples into a batch """ def __init__( @@ -332,17 +354,22 @@

Source code for doctr.datasets.loader

         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        workers: Optional[int] = None,
+        collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
         self.batch_size = batch_size
         nb = len(self.dataset) / batch_size
         self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
-        self.workers = workers
+        if collate_fn is None:
+            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
+        else:
+            self.collate_fn = collate_fn
         self.reset()
 
+    def __len__(self) -> int:
+        return self.num_batches
+
     def reset(self) -> None:
         # Updates indices after each epoch
         self._num_yielded = 0
@@ -358,9 +385,9 @@ 

Source code for doctr.datasets.loader

         if self._num_yielded < self.num_batches:
             # Get next indices
             idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
+            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
 
-            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
+            samples = list(map(self.dataset.__getitem__, indices))
 
             batch_data = self.collate_fn(samples)
 
@@ -401,8 +428,8 @@ 

Source code for doctr.datasets.loader

       
     
   
-
- +
+ diff --git a/v0.4.1/_modules/doctr/datasets/mjsynth.html b/v0.4.1/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.4.1/_modules/doctr/datasets/mjsynth.html +++ b/v0.4.1/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/ocr.html b/v0.4.1/_modules/doctr/datasets/ocr.html index 11297d5952..ce1ed8b0d4 100644 --- a/v0.4.1/_modules/doctr/datasets/ocr.html +++ b/v0.4.1/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.ocr

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple
 
-from .datasets import AbstractDataset
-from doctr.utils.geometry import fit_rbbox
+import numpy as np
 
+from .datasets import AbstractDataset
 
-__all__ = ['OCRDataset']
+__all__ = ["OCRDataset"]
 
 
 
-[docs] +[docs] class OCRDataset(AbstractDataset): """Implements an OCR dataset + >>> from doctr.datasets import OCRDataset + >>> train_set = OCRDataset(img_folder="/path/to/images", + >>> label_file="/path/to/labels.json") + >>> img, target = train_set[0] + Args: + ---- img_folder: local path to image folder (all jpg at the root) label_file: local path to the label file - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) - **kwargs: keyword arguments from `VisionDataset`. + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + **kwargs: keyword arguments from `AbstractDataset`. """ def __init__( self, img_folder: str, label_file: str, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, **kwargs: Any, ) -> None: - - self.sample_transforms = sample_transforms - self.root = img_folder + super().__init__(img_folder, **kwargs) # List images self.data: List[Tuple[str, Dict[str, Any]]] = [] - with open(label_file, 'rb') as f: + np_dtype = np.float32 + with open(label_file, "rb") as f: data = json.load(f) - for file_dic in data: + for img_name, annotations in data.items(): # Get image path - img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg' + img_name = Path(img_name) # File existence check if not os.path.exists(os.path.join(self.root, img_name)): raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") # handle empty images - if (len(file_dic["coordinates"]) == 0 or - (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")): - self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[]))) + if len(annotations["typed_words"]) == 0: + self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[]))) continue - is_valid: List[bool] = [] - box_targets: List[List[float]] = [] - for box in file_dic["coordinates"]: - if rotated_bbox: - x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32)) - box = [x, y, w, h, alpha] - is_valid.append(w > 0 and h > 0) - else: - xs, ys = zip(*box) - box = [min(xs), min(ys), max(xs), max(ys)] - is_valid.append(box[0] < box[2] and box[1] < box[3]) - if is_valid[-1]: - box_targets.append(box) + # Unpack the straight boxes (xmin, ymin, xmax, ymax) + geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + geoms = [ + [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]] # type: ignore[list-item] + for geom in geoms + ] + + text_targets = [obj["value"] for obj in annotations["typed_words"]] - text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid] - self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
+ self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
@@ -383,8 +402,8 @@

Source code for doctr.datasets.ocr

       
     
   
- - + + diff --git a/v0.4.1/_modules/doctr/datasets/recognition.html b/v0.4.1/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.4.1/_modules/doctr/datasets/recognition.html +++ b/v0.4.1/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/sroie.html b/v0.4.1/_modules/doctr/datasets/sroie.html index 66fd4ca3e0..04cf10bda2 100644 --- a/v0.4.1/_modules/doctr/datasets/sroie.html +++ b/v0.4.1/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.sroie

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import csv
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['SROIE']
+__all__ = ["SROIE"]
 
 
 
-[docs] +[docs] class SROIE(VisionDataset): """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction" <https://arxiv.org/pdf/2103.10213.pdf>`_. - Example:: - >>> from doctr.datasets import SROIE - >>> train_set = SROIE(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0 + :align: center + + >>> from doctr.datasets import SROIE + >>> train_set = SROIE(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip', - 'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip', - '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0", + "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f", + "sroie2019_train_task1.zip", + ) + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0", + "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2", + "sroie2019_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - self.sample_transforms = sample_transforms self.train = train - if rotated_bbox: - raise NotImplementedError + tmp_root = os.path.join(self.root, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + np_dtype = np.float32 - # # List images - self.root = os.path.join(self._root, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - _targets = [] - with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f: - for row in csv.reader(f, delimiter=','): - # Safeguard for blank lines - if len(row) > 0: - # Label may contain commas - label = ",".join(row[8:]) - # Reduce 8 coords to 4 - p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8]) - left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x) - top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y) - if len(label) > 0: - _targets.append((label, [left, top, right, bot])) - - text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets))) + with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f: + _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0] + + labels = [",".join(row[8:]) for row in _rows] + # reorder coordinates (8 -> (4,2) -> + # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines + coords: np.ndarray = np.stack( + [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0 + ) + + if not use_polygons: + # xmin, ymin, xmax, ymax + coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1) + + if recognition_task: + crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords) + for crop, label in zip(crops, labels): + if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0: + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, coords)) + else: + self.data.append((img_path, dict(boxes=coords, labels=labels))) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
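A short usage sketch for the refactored SROIE loader above; the flags mirror the class docstring, while archive availability and download behaviour are assumptions:

>>> from doctr.datasets import SROIE
>>> # default mode: full images paired with a dict holding "boxes" and "labels"
>>> train_set = SROIE(train=True, download=True)
>>> img, target = train_set[0]
>>> # recognition mode: pre-cropped word images paired with their transcription
>>> reco_set = SROIE(train=True, download=True, recognition_task=True)
>>> crop, label = reco_set[0]
>>> # detection mode: images paired with box coordinates only
>>> det_set = SROIE(train=False, download=True, detection_task=True)
>>> img, boxes = det_set[0]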
@@ -390,8 +444,8 @@

Source code for doctr.datasets.sroie

       
     
   
-
- + + diff --git a/v0.4.1/_modules/doctr/datasets/svhn.html b/v0.4.1/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.4.1/_modules/doctr/datasets/svhn.html +++ b/v0.4.1/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/svt.html b/v0.4.1/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.4.1/_modules/doctr/datasets/svt.html +++ b/v0.4.1/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/synthtext.html b/v0.4.1/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.4.1/_modules/doctr/datasets/synthtext.html +++ b/v0.4.1/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.4.1/_modules/doctr/datasets/utils.html b/v0.4.1/_modules/doctr/datasets/utils.html index 2259698c0f..bde9304597 100644 --- a/v0.4.1/_modules/doctr/datasets/utils.html +++ b/v0.4.1/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.utils

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import string
 import unicodedata
+from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Sequence as SequenceType
+
 import numpy as np
-from typing import List, Optional, Any
+from PIL import Image
+
+from doctr.io.image import get_img_shape
+from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
 
 from .vocabs import VOCABS
 
-__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
+__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
+
+ImageTensor = TypeVar("ImageTensor")
 
 
 def translate(
     input_string: str,
     vocab_name: str,
-    unknown_char: str = '■',
+    unknown_char: str = "■",
 ) -> str:
     """Translate a string input in a given vocabulary
 
     Args:
+    ----
         input_string: input string to translate
         vocab_name: vocabulary to use (french, latin, ...)
         unknown_char: unknown character for non-translatable characters
 
     Returns:
-        A string translated in a given vocab"""
-
+    -------
+        A string translated in a given vocab
+    """
     if VOCABS.get(vocab_name) is None:
         raise KeyError("output vocabulary must be in vocabs dictionary")
 
-    translated = ''
+    translated = ""
     for char in input_string:
         if char not in VOCABS[vocab_name]:
             # we need to translate char into a vocab char
@@ -315,51 +350,63 @@ 

Source code for doctr.datasets.utils

                 # remove whitespaces
                 continue
             # normalize character if it is not in vocab
-            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
-            if char == '' or char not in VOCABS[vocab_name]:
+            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
+            if char == "" or char not in VOCABS[vocab_name]:
                 # if normalization fails or char is still not in vocab, use the unknown character
                 char = unknown_char
         translated += char
     return translated
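As a quick illustration of the fallback behaviour above, in a sketch where the vocab names are assumed to be valid `VOCABS` keys:

>>> from doctr.datasets.utils import translate
>>> translate("Crème", "english")                   # assuming an accent-free "english" vocab: 'è' is NFD-normalized to 'e'
>>> translate("→ 42", "digits", unknown_char="■")   # assuming a digits-only vocab: unmappable chars become '■'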
 
 
-def encode_sequence(
+def encode_string(
     input_string: str,
     vocab: str,
 ) -> List[int]:
     """Given a predefined mapping, encode the string to a sequence of numbers
 
     Args:
+    ----
         input_string: string to encode
         vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A list encoding the input_string"""
-
-    return list(map(vocab.index, input_string))  # type: ignore[arg-type]
+    -------
+        A list encoding the input_string
+    """
+    try:
+        return list(map(vocab.index, input_string))
+    except ValueError:
+        raise ValueError(
+            f"some characters cannot be found in 'vocab'. \
+                         Please check the input string {input_string} and the vocabulary {vocab}"
+        )
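Spelled out on a toy vocabulary (not one of the shipped `VOCABS`), the encoding is simply character-to-index:

>>> from doctr.datasets.utils import encode_string
>>> encode_string("cab", "abc")   # each character is replaced by its index in the vocab string
[2, 0, 1]
>>> encode_string("abz", "abc")   # 'z' is absent from the vocab, hence the explicit ValueError above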
 
 
 def decode_sequence(
-    input_array: np.array,
+    input_seq: Union[np.ndarray, SequenceType[int]],
     mapping: str,
 ) -> str:
     """Given a predefined mapping, decode the sequence of numbers to a string
 
     Args:
-        input_array: array to decode
+    ----
+        input_seq: array to decode
         mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A string, decoded from input_array"""
-
-    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
+    -------
+        A string, decoded from input_seq
+    """
+    if not isinstance(input_seq, (Sequence, np.ndarray)):
+        raise TypeError("Invalid sequence type")
+    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
         raise AssertionError("Input must be an array of int, with max less than mapping size")
-    decoded = ''.join(mapping[idx] for idx in input_array)
-    return decoded
+
+    return "".join(map(mapping.__getitem__, input_seq))
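Decoding is the inverse operation, and the reworked signature accepts plain integer sequences as well as arrays (toy vocab again):

>>> from doctr.datasets.utils import decode_sequence, encode_string
>>> decode_sequence(encode_string("cab", "abc"), "abc")   # round-trip through the toy vocab
'cab'
>>> decode_sequence([0, 1, 2], "abc")
'abc'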
 
 
 
-[docs] +[docs] def encode_sequences( sequences: List[str], vocab: str, @@ -367,48 +414,53 @@

Source code for doctr.datasets.utils

     eos: int = -1,
     sos: Optional[int] = None,
     pad: Optional[int] = None,
-    **kwargs: Any,
+    dynamic_seq_length: bool = False,
 ) -> np.ndarray:
     """Encode character sequences using a given vocab as mapping
 
     Args:
+    ----
         sequences: the list of character sequences of size N
         vocab: the ordered vocab to use for encoding
         target_size: maximum length of the encoded data
         eos: encoding of End Of String
         sos: optional encoding of Start Of String
         pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
+        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
 
     Returns:
+    -------
         the padded encoded data as a tensor
     """
-
     if 0 <= eos < len(vocab):
         raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
 
-    if not isinstance(target_size, int):
-        target_size = max(len(w) for w in sequences)
-        if sos:
-            target_size += 1
-        if pad:
-            target_size += 1
+    if not isinstance(target_size, int) or dynamic_seq_length:
+        # Maximum string length + EOS
+        max_length = max(len(w) for w in sequences) + 1
+        if isinstance(sos, int):
+            max_length += 1
+        if isinstance(pad, int):
+            max_length += 1
+        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
 
     # Pad all sequences
-    if pad:  # pad with padding symbol
+    if isinstance(pad, int):  # pad with padding symbol
         if 0 <= pad < len(vocab):
             raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
         # In that case, add EOS at the end of the word before padding
-        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
+        default_symbol = pad
     else:  # pad with eos symbol
-        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
+        default_symbol = eos
+    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
 
-    for idx, seq in enumerate(sequences):
-        encoded_seq = encode_sequence(seq, vocab)
-        if pad:  # add eos at the end of the sequence
-            encoded_seq.append(eos)
-        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
+    # Encode the strings
+    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
+        if isinstance(pad, int):  # add eos at the end of the sequence
+            seq.append(eos)
+        encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)]
 
-    if sos:  # place eos symbol at the beginning of each sequence
+    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
         if 0 <= sos < len(vocab):
             raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
         encoded_data = np.roll(encoded_data, 1)
@@ -416,6 +468,59 @@ 

Source code for doctr.datasets.utils

 
     return encoded_data
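Putting the padding rules above together on a toy vocab "abc", with `eos` and `pad` chosen outside the vocab indices (a sketch derived from the code shown, not an official doctest):

>>> import numpy as np
>>> from doctr.datasets.utils import encode_sequences
>>> encode_sequences(["ab", "c"], vocab="abc", eos=4, pad=3)
array([[0, 1, 4, 3],
       [2, 4, 3, 3]], dtype=int32)

Here `target_size` resolves to 4 (longest word plus EOS plus PAD), each word is followed by one EOS symbol, and PAD fills the remainder of the row.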
+ + +def convert_target_to_relative( + img: ImageTensor, target: Union[np.ndarray, Dict[str, Any]] +) -> Tuple[ImageTensor, Union[Dict[str, Any], np.ndarray]]: + if isinstance(target, np.ndarray): + target = convert_to_relative_coords(target, get_img_shape(img)) + else: + target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img)) + return img, target + + +def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]: + """Crop a set of bounding boxes from an image + + Args: + ---- + img_path: path to the image + geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4) + + Returns: + ------- + a list of cropped images + """ + with Image.open(img_path) as pil_img: + img: np.ndarray = np.array(pil_img.convert("RGB")) + # Polygon + if geoms.ndim == 3 and geoms.shape[1:] == (4, 2): + return extract_rcrops(img, geoms.astype(dtype=int)) + if geoms.ndim == 2 and geoms.shape[1] == 4: + return extract_crops(img, geoms.astype(dtype=int)) + raise ValueError("Invalid geometry format") + + +def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]: + """Converts multiclass target to relative coordinates. + + Args: + ---- + img: Image + target: tuple of target polygons and their classes names + + Returns: + ------- + Image and dictionary of boxes, with class names as keys + """ + boxes = convert_to_relative_coords(target[0], get_img_shape(img)) + boxes_classes = target[1] + boxes_dict: Dict = {k: [] for k in sorted(set(boxes_classes))} + for k, poly in zip(boxes_classes, boxes): + boxes_dict[k].append(poly) + boxes_dict = {k: np.stack(v, axis=0) for k, v in boxes_dict.items()} + return img, boxes_dict
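The new `crop_bboxes_from_image` helper added at the end of this hunk can be exercised on its own; the image path below is a placeholder:

>>> import numpy as np
>>> from doctr.datasets.utils import crop_bboxes_from_image
>>> boxes = np.array([[10, 10, 120, 40], [30, 60, 200, 95]])        # (N, 4) straight boxes in absolute pixels
>>> crops = crop_bboxes_from_image("path/to/receipt.jpg", boxes)    # placeholder path
>>> [c.ndim for c in crops]                                         # each crop is an H x W x 3 RGB numpy array
[3, 3]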
@@ -448,8 +553,8 @@

Source code for doctr.datasets.utils

       
     
   
- - + + diff --git a/v0.4.1/_modules/doctr/datasets/wildreceipt.html b/v0.4.1/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.4.1/_modules/doctr/datasets/wildreceipt.html +++ b/v0.4.1/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.4.1/_modules/doctr/documents/elements.html b/v0.4.1/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.4.1/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
-
- -
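Taken together, the element hierarchy removed above composed as follows (a sketch against the deleted `doctr.documents.elements` API, with made-up relative coordinates):

from doctr.documents.elements import Word, Line, Block, Page, Document

w1 = Word("Hello", 0.99, ((0.10, 0.10), (0.30, 0.20)))   # value, confidence, ((xmin, ymin), (xmax, ymax))
w2 = Word("world", 0.98, ((0.35, 0.10), (0.60, 0.20)))
line = Line([w1, w2])                                      # geometry resolved to the enclosing bbox
page = Page(blocks=[Block(lines=[line])], page_idx=0, dimensions=(1024, 768))
doc = Document(pages=[page])
assert doc.render() == "Hello world"                       # render() walks pages -> blocks -> lines -> words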
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/_modules/doctr/documents/reader.html b/v0.4.1/_modules/doctr/documents/reader.html deleted file mode 100644 index cdcd814b6c..0000000000 --- a/v0.4.1/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
-
- -
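The removed reader exposed the following entry points (a sketch of the deleted API; file paths and the URL are placeholders):

from doctr.documents.reader import DocumentFile

pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")        # wraps a fitz.Document in the PDF helper
pages = pdf_doc.as_images()                                     # list of H x W x 3 numpy arrays
words = pdf_doc.get_words()                                     # per-page list of (bounding box, text) tuples
imgs = DocumentFile.from_images(["page1.png", "page2.png"])     # already-decoded pages as numpy arrays
web_doc = DocumentFile.from_url("https://www.yoursite.com")     # renders the page to PDF via weasyprint first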
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/_modules/doctr/io/elements.html b/v0.4.1/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.4.1/_modules/doctr/io/elements.html +++ b/v0.4.1/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.4.1/_modules/doctr/io/html.html b/v0.4.1/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.4.1/_modules/doctr/io/html.html +++ b/v0.4.1/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.4.1/_modules/doctr/io/image/base.html b/v0.4.1/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.4.1/_modules/doctr/io/image/base.html +++ b/v0.4.1/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.4.1/_modules/doctr/io/image/tensorflow.html b/v0.4.1/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.4.1/_modules/doctr/io/image/tensorflow.html +++ b/v0.4.1/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.4.1/_modules/doctr/io/pdf.html b/v0.4.1/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.4.1/_modules/doctr/io/pdf.html +++ b/v0.4.1/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.4.1/_modules/doctr/io/reader.html b/v0.4.1/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.4.1/_modules/doctr/io/reader.html +++ b/v0.4.1/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.4.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.4.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.4.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.4.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.4.1/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.4.1/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.4.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.4.1/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.4.1/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.4.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.4.1/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.4.1/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.4.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.4.1/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.4.1/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.4.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.4.1/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.4.1/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.4.1/_modules/doctr/models/classification/zoo.html b/v0.4.1/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.4.1/_modules/doctr/models/classification/zoo.html +++ b/v0.4.1/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

<
- + diff --git a/v0.4.1/_modules/doctr/models/detection/differentiable_binarization.html b/v0.4.1/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index 38e9b36ec2..0000000000 --- a/v0.4.1/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.differentiable_binarization - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to expand (unclip) polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize p_map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: the polygon vertices to expand, as an array of absolute coordinates
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly cast to an ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
-
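In isolation, the unclip step above amounts to a pyclipper offset by `area * unclip_ratio / perimeter` followed by a bounding-rect fit (a standalone sketch, not tied to the class):

import cv2
import numpy as np
import pyclipper
from shapely.geometry import Polygon

points = np.array([[0, 0], [100, 0], [100, 20], [0, 20]])        # a thin text polygon
poly = Polygon(points)
distance = poly.area * 1.5 / poly.length                          # offset distance with unclip_ratio = 1.5
offset = pyclipper.PyclipperOffset()
offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.asarray(offset.Execute(distance)[0])                # grown polygon vertices
x, y, w, h = cv2.boundingRect(expanded.astype(np.int32))          # 4-point box in absolute coordinates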
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too-small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channel to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):  # top-down pathway: add the upsampled coarser map
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature maps is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
-
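Numerically, the expression above is the triangle-height formula: from |PA|^2, |PB|^2 and |AB|^2 it recovers sin^2 of the angle at P and hence the distance from P to the line (AB). A tiny sanity check (a sketch, independent of the class):

import numpy as np

xs, ys = np.array([[3.0]]), np.array([[4.0]])                 # a single pixel P = (3, 4)
a, b = np.array([0.0, 0.0]), np.array([10.0, 0.0])            # segment endpoints A, B
d1 = np.square(xs - a[0]) + np.square(ys - a[1])              # |PA|^2 = 25
d2 = np.square(xs - b[0]) + np.square(ys - b[1])              # |PB|^2 = 65
d = np.square(a[0] - b[0]) + np.square(a[1] - b[1])           # |AB|^2 = 100
cosin = (d - d1 - d2) / (2 * np.sqrt(d1 * d2))                # same cosine term as in the method above
dist = np.sqrt(d1 * d2 * (1 - np.square(cosin)) / d)          # -> 4.0, the perpendicular drop onto AB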
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon threshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coord., to draw the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
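Re-stated as a hedged NumPy sketch (not the library's API), the three terms above combine as balanced BCE (scaled by 5), dice on the approximate binary map, and L1 on the threshold map (scaled by 10); the validity mask and the per-pixel BCE weighting of the dice term are omitted for brevity.

import numpy as np

def db_loss_sketch(prob_map, thresh_map, seg_target, thresh_target, eps=1e-6):
    # Balanced BCE: every positive pixel plus the hardest negatives (at most 3x as many)
    bce = -(seg_target * np.log(prob_map + eps) + (1 - seg_target) * np.log(1 - prob_map + eps))
    pos_count = seg_target.sum()
    neg_count = min((1 - seg_target).sum(), 3.0 * pos_count)
    hardest_neg = np.sort((bce * (1 - seg_target)).ravel())[::-1][: int(neg_count)]
    balanced_bce = ((bce * seg_target).sum() + hardest_neg.sum()) / (pos_count + neg_count + eps)
    # Dice loss on the approximate binary map b = sigmoid(50 * (P - T))
    bin_map = 1.0 / (1.0 + np.exp(-50.0 * (prob_map - thresh_map)))
    dice = 1.0 - 2.0 * (bin_map * seg_target).sum() / (bin_map.sum() + seg_target.sum() + eps)
    # L1 distance between predicted and target threshold maps
    l1 = np.abs(thresh_map - thresh_target).mean()
    return 10.0 * l1 + 5.0 * balanced_bce + dice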
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
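A hedged usage sketch of the dual-mode call above: with a target the model returns a loss, without one it runs the box post-processor; `db_resnet50` is the factory defined further down, the tensors are illustrative.

import numpy as np
import tensorflow as tf

model = db_resnet50(pretrained=False)
x = tf.random.uniform((1, 1024, 1024, 3), maxval=1, dtype=tf.float32)
target = [{"boxes": np.array([[0.1, 0.1, 0.4, 0.3]], dtype=np.float32),
           "flags": np.array([False])}]

train_out = model(x, target=target)             # {'loss': <scalar tensor>}
infer_out = model(x, return_model_output=True)  # {'out_map': ..., 'boxes': ...}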
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
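As an aside, the private factory above can be called directly to patch the default configuration, for instance with a smaller input resolution; a hedged sketch:

# Untrained DBNet on 512x512 inputs instead of the default 1024x1024
model = _db_resnet('db_resnet50', pretrained=False, input_shape=(512, 512, 3))
print(model.cfg['input_shape'])  # (512, 512, 3)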
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
\ No newline at end of file
diff --git a/v0.4.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.4.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
index 9145c7c3fd..66cef8663d 100644
--- a/v0.4.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
+++ b/v0.4.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
@@ -13,7 +13,7 @@
@@ -225,20 +225,42 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import List, Tuple, Optional, Any, Dict
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
+
+from ...classification import mobilenet_v3_large
 from .base import DBPostProcessor, _DBNet
 
-__all__ = ['DBNet', 'db_resnet50']
+__all__ = ["DBNet", "db_resnet50", "db_mobilenet_v3_large"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
+    "db_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_resnet50-649fa22b.weights.h5&src=0",
+    },
+    "db_mobilenet_v3_large": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_mobilenet_v3_large-ee2e1dbe.weights.h5&src=0",
     },
 }
 
@@ -313,6 +348,7 @@ 

Source code for doctr.models.detection.differentiable_binarization.tensorflo <https://arxiv.org/pdf/1612.03144.pdf>`_. Args: + ---- channels: number of channel to output """ @@ -322,9 +358,9 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo ) -> None: super().__init__() self.channels = channels - self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest') - self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)] - self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)] + self.upsample = layers.UpSampling2D(size=(2, 2), interpolation="nearest") + self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer="he_normal") for _ in range(4)] + self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2**idx) for idx in range(4)] @staticmethod def build_upsampling( @@ -334,20 +370,21 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo """Module which performs a 3x3 convolution followed by up-sampling Args: + ---- channels: number of output channels dilation_factor (int): dilation factor to scale the convolution output before concatenation Returns: + ------- a keras.layers.Layer object, wrapping these operations in a sequential module """ - - _layers = conv_sequence(channels, 'relu', True, kernel_size=3) + _layers = conv_sequence(channels, "relu", True, kernel_size=3) if dilation_factor > 1: - _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest')) + _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation="nearest")) - module = keras.Sequential(_layers) + module = Sequential(_layers) return module @@ -359,7 +396,6 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo x: List[tf.Tensor], **kwargs: Any, ) -> tf.Tensor: - # Channel mapping results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)] # Upsample & sum @@ -371,200 +407,324 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo return layers.concatenate(results) -class DBNet(_DBNet, keras.Model, NestedObject): +class DBNet(_DBNet, Model, NestedObject): """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_. Args: + ---- feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to + bin_thresh: threshold for binarization + box_thresh: minimal objectness score to consider a box + assume_straight_pages: if True, fit straight bounding boxes only + exportable: onnx exportable returns only logits + cfg: the configuration dict of the model + class_names: list of class names """ - _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "fpn", "probability_head", "threshold_head", "postprocessor"] def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, - rotated_bbox: bool = False, + fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea + bin_thresh: float = 0.3, + box_thresh: float = 0.1, + assume_straight_pages: bool = True, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, + class_names: List[str] = [CLASS_NAME], ) -> None: - super().__init__() + self.class_names = class_names + num_classes: int = len(self.class_names) self.cfg = cfg self.feat_extractor = feature_extractor - self.rotated_bbox = rotated_bbox + self.exportable = exportable + self.assume_straight_pages = assume_straight_pages self.fpn = FeaturePyramidNetwork(channels=fpn_channels) # Initialize kernels _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape] output_shape = tuple(self.fpn(_inputs).shape) - self.probability_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] + self.probability_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + self.threshold_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + + self.postprocessor = DBPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh ) - self.threshold_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] - ) - - self.postprocessor = 
DBPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, thresh_map: tf.Tensor, - target: List[Dict[str, Any]] + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output Args: + ---- out_map: output feature map of the model of shape (N, H, W, C) thresh_map: threshold map of shape (N, H, W, C) target: list of dictionary where each dict has a `boxes` and a `flags` entry + gamma: modulating factor in the focal loss formula + alpha: balancing factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") - prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1])) - thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) + prob_map = tf.math.sigmoid(out_map) + thresh_map = tf.math.sigmoid(thresh_map) - seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + seg_target, seg_mask, thresh_target, thresh_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) + seg_mask = tf.cast(seg_mask, tf.float32) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) - # Compute balanced BCE loss for proba_map - bce_scale = 5. - bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask] - - neg_target = 1 - seg_target[seg_mask] - positive_count = tf.math.reduce_sum(seg_target[seg_mask]) - negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count]) - negative_loss = bce_loss * neg_target - negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32)) - sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss) - balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6) - - # Compute dice loss for approxbin_map - bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask]))) - - bce_min = tf.math.reduce_min(bce_loss) - weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1. 
- inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights) - union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8 - dice_loss = 1 - 2.0 * inter / union + # Focal loss + focal_scale = 10.0 + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + + # Convert logits to prob, compute gamma factor + p_t = (seg_target * prob_map) + ((1 - seg_target) * (1 - prob_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class or for approx binary_map + if len(self.class_names) > 1: + dice_map = tf.nn.softmax(out_map, axis=-1) + else: + # compute binary map instead + dice_map = 1.0 / (1.0 + tf.exp(-50 * (prob_map - thresh_map))) + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) # Compute l1 loss for thresh_map - l1_scale = 10. if tf.reduce_any(thresh_mask): - l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask])) + thresh_mask = tf.cast(thresh_mask, tf.float32) + l1_loss = tf.reduce_sum(tf.abs(thresh_map - thresh_target) * thresh_mask) / ( + tf.reduce_sum(thresh_mask) + eps + ) else: - l1_loss = tf.constant(0.) + l1_loss = tf.constant(0.0) - return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss + return l1_loss + focal_scale * focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) feat_concat = self.fpn(feat_maps, **kwargs) logits = self.probability_head(feat_concat, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: - # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + if target is None or return_preds: + # Post-process boxes (keep only text predictions) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: thresh_map = self.threshold_head(feat_concat, **kwargs) loss = self.compute_loss(logits, thresh_map, target) - out['loss'] = loss + out["loss"] = loss return out -def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: +def _db_resnet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['fpn_channels'] = 
kwargs.get('fpn_channels', _cfg['fpn_channels']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) # Feature extractor - resnet = tf.keras.applications.__dict__[_cfg['backbone']]( - include_top=False, - weights=None, - input_shape=_cfg['input_shape'], - pooling=None, + feat_extractor = IntermediateLayerGetter( + backbone_fn( + weights="imagenet" if pretrained_backbone else None, + include_top=False, + pooling=None, + input_shape=_cfg["input_shape"], + ), + fpn_layers, ) + # Build the model + model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + + # Load pretrained parameters + if pretrained: + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) + + return model + + +def _db_mobilenet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained + + # Patch the config + _cfg = deepcopy(default_cfgs[arch]) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = default_cfgs[arch].get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor feat_extractor = IntermediateLayerGetter( - resnet, - _cfg['fpn_layers'], + backbone_fn( + input_shape=_cfg["input_shape"], + include_top=False, + pretrained=pretrained_backbone, + ), + fpn_layers, ) - kwargs['fpn_channels'] = _cfg['fpn_channels'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] - # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model
-[docs] +[docs] def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture Returns: + ------- text detection architecture """ + return _db_resnet( + "db_resnet50", + pretrained, + ResNet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
+ + + +
+[docs] +def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: + """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" + <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. + + >>> import tensorflow as tf + >>> from doctr.models import db_mobilenet_v3_large + >>> model = db_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) - return _db_resnet('db_resnet50', pretrained, **kwargs)
+ Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture + + Returns: + ------- + text detection architecture + """ + return _db_mobilenet( + "db_mobilenet_v3_large", + pretrained, + mobilenet_v3_large, + ["inverted_2", "inverted_5", "inverted_11", "final_block"], + **kwargs, + )

@@ -598,8 +758,8 @@
diff --git a/v0.4.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.4.1/_modules/doctr/models/detection/fast/tensorflow.html
index 5b84d2dea1..65e1a77af8 100644
--- a/v0.4.1/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/v0.4.1/_modules/doctr/models/detection/fast/tensorflow.html
@@ -13,7 +13,7 @@
@@ -769,7 +769,7 @@
diff --git a/v0.4.1/_modules/doctr/models/detection/linknet.html b/v0.4.1/_modules/doctr/models/detection/linknet.html
deleted file mode 100644
index 129cfdce8b..0000000000
--- a/v0.4.1/_modules/doctr/models/detection/linknet.html
+++ /dev/null
@@ -1,644 +0,0 @@

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize p_map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from differentiable linknet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
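A hedged, self-contained sketch of the connected-component logic above on a synthetic probability map; in practice the parent post-processor derives the bitmap from `bin_thresh` itself.

import numpy as np

postproc = LinkNetPostProcessor(bin_thresh=0.15, box_thresh=0.1)
prob_map = np.zeros((256, 256), dtype=np.float32)
prob_map[40:80, 60:180] = 0.9                    # one synthetic text blob
bitmap = (prob_map > 0.15).astype(np.uint8)
boxes = postproc.bitmap_to_boxes(pred=prob_map, bitmap=bitmap)
# boxes: (N, 5) array of relative xmin, ymin, xmax, ymax, score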
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
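For orientation, a hedged shape trace of the additive skip connections above, assuming a 512x512 input to the stem defined below (so the FPN sees a 128x128x64 feature map); the sizes are illustrative.

import tensorflow as tf

fpn = LinkNetFPN()
feat = tf.zeros((1, 128, 128, 64))  # assumed stem output for a 512x512 image
# x_1..x_4 halve the resolution at 64/128/256/512 channels; each decoder doubles it back,
# so y_4 + x_3, y_3 + x_2 and y_2 + x_1 line up both spatially and channel-wise.
out = fpn(feat)
print(out.shape)  # expected: (1, 128, 128, 64)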
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, fully masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
\ No newline at end of file
diff --git a/v0.4.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.4.1/_modules/doctr/models/detection/linknet/tensorflow.html
index cd4f446673..ce995f99d4 100644
--- a/v0.4.1/_modules/doctr/models/detection/linknet/tensorflow.html
+++ b/v0.4.1/_modules/doctr/models/detection/linknet/tensorflow.html
@@ -13,7 +13,7 @@
@@ -225,20 +225,42 @@

Source code for doctr.models.detection.linknet.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.classification import resnet18, resnet34, resnet50
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.backbones import ResnetStage
-from doctr.models.utils import conv_sequence, load_pretrained_params
-from .base import LinkNetPostProcessor, _LinkNet
 
-__all__ = ['LinkNet', 'linknet16']
+from .base import LinkNetPostProcessor, _LinkNet
 
+__all__ = ["LinkNet", "linknet_resnet18", "linknet_resnet34", "linknet_resnet50"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet16': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'num_classes': 1,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': None,
+    "linknet_resnet18": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet18-615a82c5.weights.h5&src=0",
+    },
+    "linknet_resnet34": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet34-9d772be5.weights.h5&src=0",
+    },
+    "linknet_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet50-6bf6c8b5.weights.h5&src=0",
     },
 }
 
 
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
+def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential:
     """Creates a LinkNet decoder block"""
-
     return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
+        *conv_sequence(in_chan // 4, "relu", True, kernel_size=1, **kwargs),
         layers.Conv2DTranspose(
             filters=in_chan // 4,
             kernel_size=3,
-            strides=2,
+            strides=stride,
             padding="same",
             use_bias=False,
-            kernel_initializer='he_normal'
+            kernel_initializer="he_normal",
         ),
         layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
+        layers.Activation("relu"),
+        *conv_sequence(out_chan, "relu", True, kernel_size=1),
     ])
 
 
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module"""
+class LinkNetFPN(Model, NestedObject):
+    """LinkNet Decoder module"""
 
     def __init__(
         self,
+        out_chans: int,
+        in_shapes: List[Tuple[int, ...]],
     ) -> None:
-
         super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
+        self.out_chans = out_chans
+        strides = [2] * (len(in_shapes) - 1) + [1]
+        i_chans = [s[-1] for s in in_shapes[::-1]]
+        o_chans = i_chans[1:] + [out_chans]
+        self.decoders = [
+            decoder_block(in_chan, out_chan, s, input_shape=in_shape)
+            for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1])
+        ]
+
+    def call(self, x: List[tf.Tensor], **kwargs: Any) -> tf.Tensor:
+        out = 0
+        for decoder, fmap in zip(self.decoders, x[::-1]):
+            out = decoder(out + fmap, **kwargs)
+        return out
 
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(_LinkNet, keras.Model):
+    def extra_repr(self) -> str:
+        return f"out_chans={self.out_chans}"
+
+
+class LinkNet(_LinkNet, Model):
     """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
     <https://arxiv.org/pdf/1707.03718.pdf>`_.
 
     Args:
-        num_classes: number of channels for the output
+    ----
+        feature extractor: the backbone serving as feature extractor
+        fpn_channels: number of channels each extracted feature maps is mapped to
+        bin_thresh: threshold for binarization of the output feature map
+        box_thresh: minimal objectness score to consider a box
+        assume_straight_pages: if True, fit straight bounding boxes only
+        exportable: onnx exportable returns only logits
+        cfg: the configuration dict of the model
+        class_names: list of class names
     """
 
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
+    _children_names: List[str] = ["feat_extractor", "fpn", "classifier", "postprocessor"]
 
     def __init__(
         self,
-        num_classes: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        rotated_bbox: bool = False,
+        feat_extractor: IntermediateLayerGetter,
+        fpn_channels: int = 64,
+        bin_thresh: float = 0.1,
+        box_thresh: float = 0.1,
+        assume_straight_pages: bool = True,
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
+        class_names: List[str] = [CLASS_NAME],
     ) -> None:
         super().__init__(cfg=cfg)
 
-        self.rotated_bbox = rotated_bbox
+        self.class_names = class_names
+        num_classes: int = len(self.class_names)
 
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
+        self.exportable = exportable
+        self.assume_straight_pages = assume_straight_pages
+
+        self.feat_extractor = feat_extractor
 
-        self.fpn = LinkNetFPN()
+        self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape])
+        self.fpn.build(self.feat_extractor.output_shape)
 
         self.classifier = Sequential([
             layers.Conv2DTranspose(
@@ -393,154 +442,246 @@ 

Source code for doctr.models.detection.linknet.tensorflow

strides=2, padding="same", use_bias=False, - kernel_initializer='he_normal' + kernel_initializer="he_normal", + input_shape=self.fpn.decoders[-1].output_shape[1:], ), layers.BatchNormalization(), - layers.Activation('relu'), - *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), + layers.Activation("relu"), + *conv_sequence(32, "relu", True, kernel_size=3, strides=1), layers.Conv2DTranspose( filters=num_classes, kernel_size=2, strides=2, padding="same", - use_bias=False, - kernel_initializer='he_normal' + use_bias=True, + kernel_initializer="he_normal", ), ]) - self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) + self.postprocessor = LinkNetPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh + ) def compute_loss( self, out_map: tf.Tensor, - target: List[Dict[str, Any]], - focal_loss: bool = False, - alpha: float = .5, - gamma: float = 2., - edge_factor: float = 2., + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on <https://github.com/tensorflow/addons/>`_. Args: + ---- out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry - focal_loss: if True, use focal loss instead of BCE - edge_factor: boost factor for box edges (in case of BCE) + gamma: modulating factor in the focal loss formula alpha: balancing factor in the focal loss formula - gammma: modulating factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ - seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) - edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) + seg_target, seg_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - - # Get the cross_entropy for each entry - bce = tf.keras.losses.binary_crossentropy( - seg_target[seg_mask], - tf.squeeze(out_map, axis=[-1])[seg_mask], - from_logits=True) - - if focal_loss: - if gamma and gamma < 0: - raise ValueError("Value of gamma should be greater than or equal to zero.") - - # Convert logits to prob, compute gamma factor - pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) - p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) - modulating_factor = tf.pow((1.0 - p_t), gamma) - - # Compute alpha factor - alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) - - # compute the final loss - loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) - - else: - # Compute BCE loss with highlighted edges - loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), - bce - ) - loss = tf.reduce_mean(loss) - - return loss + seg_mask = tf.cast(seg_mask, tf.float32) + + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + proba_map = tf.sigmoid(out_map) + + # Focal loss + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + # Convert logits to prob, compute gamma factor + p_t = (seg_target * proba_map) + ((1 - seg_target) * (1 - proba_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * 
(1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class + dice_map = tf.nn.softmax(out_map, axis=-1) if len(self.class_names) > 1 else proba_map + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) + + return focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, - focal_loss: bool = True, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - - logits = self.stem(x) - logits = self.fpn(logits) - logits = self.classifier(logits) + feat_maps = self.feat_extractor(x, **kwargs) + logits = self.fpn(feat_maps, **kwargs) + logits = self.classifier(logits, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) + if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: + if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: - loss = self.compute_loss(logits, target, focal_loss) - out['loss'] = loss + loss = self.compute_loss(logits, target) + out["loss"] = loss return out -def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: +def _linknet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> LinkNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor + feat_extractor = IntermediateLayerGetter( + backbone_fn( + pretrained=pretrained_backbone, + include_top=False, + input_shape=_cfg["input_shape"], + ), + fpn_layers, + ) - kwargs['num_classes'] = _cfg['num_classes'] - kwargs['input_shape'] = _cfg['input_shape'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(cfg=_cfg, **kwargs) + model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + 
skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model -
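Stepping back from the flattened hunks above: the replacement objective for both detection heads is a per-pixel focal term plus a dice term (the DBNet variant additionally scales the focal term by 10 and adds the threshold-map L1). A hedged NumPy restatement with the new defaults gamma=2.0 and alpha=0.5; everything else is illustrative, not the library's API.

import numpy as np

def focal_dice_sketch(logits, seg_target, seg_mask, gamma=2.0, alpha=0.5, eps=1e-8):
    proba = 1.0 / (1.0 + np.exp(-logits))
    bce = -(seg_target * np.log(proba + eps) + (1 - seg_target) * np.log(1 - proba + eps))
    # Focal term: down-weight easy pixels via (1 - p_t) ** gamma, balance classes via alpha_t
    p_t = seg_target * proba + (1 - seg_target) * (1 - proba)
    alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha)
    focal = (seg_mask * alpha_t * (1 - p_t) ** gamma * bce).sum() / (seg_mask.sum() + eps)
    # Dice term on the probability map (softmax over classes when more than one is predicted)
    inter = (seg_mask * proba * seg_target).sum()
    card = (seg_mask * (proba + seg_target)).sum()
    dice = 1.0 - 2.0 * inter / (card + eps)
    return focal + dice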
-[docs] -def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
+[docs] +def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet18 + >>> model = linknet_resnet18(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture + + Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet18", + pretrained, + resnet18, + ["resnet_block_1", "resnet_block_3", "resnet_block_5", "resnet_block_7"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet34(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet16 - >>> model = linknet16(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet34 + >>> model = linknet_resnet34(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture Returns: + ------- text detection architecture """ + return _linknet( + "linknet_resnet34", + pretrained, + resnet34, + ["resnet_block_2", "resnet_block_6", "resnet_block_12", "resnet_block_15"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet50(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet50 + >>> model = linknet_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture - return _linknet('linknet16', pretrained, **kwargs)
+ Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet50", + pretrained, + resnet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
@@ -574,8 +715,8 @@

Source code for doctr.models.detection.linknet.tensorflow

- +
+ diff --git a/v0.4.1/_modules/doctr/models/detection/zoo.html b/v0.4.1/_modules/doctr/models/detection/zoo.html index d3128b8d14..3651c4e2d3 100644 --- a/v0.4.1/_modules/doctr/models/detection/zoo.html +++ b/v0.4.1/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
 from doctr.file_utils import is_tf_available, is_torch_available
-from .core import DetectionPredictor
-from ..preprocessor import PreProcessor
-from .. import detection
 
+from .. import detection
+from ..detection.fast import reparameterize
+from ..preprocessor import PreProcessor
+from .predictor import DetectionPredictor
 
 __all__ = ["detection_predictor"]
 
+ARCHS: List[str]
+
 
 if is_tf_available():
-    ARCHS = ['db_resnet50', 'linknet16']
+    ARCHS = [
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
 elif is_torch_available():
-    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
+    ARCHS = [
+        "db_resnet34",
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
+
 
+def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
+        _model = detection.__dict__[arch](
+            pretrained=pretrained,
+            pretrained_backbone=kwargs.get("pretrained_backbone", True),
+            assume_straight_pages=assume_straight_pages,
+        )
+        # Reparameterize FAST models by default to lower inference latency and memory usage
+        if isinstance(_model, detection.FAST):
+            _model = reparameterize(_model)
+    else:
+        if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
+            raise ValueError(f"unknown architecture: {type(arch)}")
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+        _model = arch
+        _model.assume_straight_pages = assume_straight_pages
+        _model.postprocessor.assume_straight_pages = assume_straight_pages
 
-    # Detection
-    _model = detection.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 1)
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 2)
     predictor = DetectionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
-        _model
+        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
+        _model,
     )
     return predictor
 
 
 
-[docs] -def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: +[docs] +def detection_predictor( + arch: Any = "fast_base", + pretrained: bool = False, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + batch_size: int = 2, + **kwargs: Any, +) -> DetectionPredictor: """Text detection architecture. - Example:: - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_resnet50') + ---- + arch: name of the architecture or model itself to use (e.g. 'db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset + assume_straight_pages: If True, fit straight boxes to the page + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right + batch_size: number of samples the model processes in parallel + **kwargs: optional keyword arguments passed to the architecture Returns: + ------- Detection predictor """ - - return _predictor(arch, pretrained, **kwargs)
+ return _predictor( + arch=arch, + pretrained=pretrained, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + batch_size=batch_size, + **kwargs, + )
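Because `arch` now accepts an already-built model as well as an architecture name, a rough sketch of the object-based path (assuming the TensorFlow backend and using only names from this module) could be:

from doctr.models import detection_predictor, linknet_resnet18

# Minimal sketch: hand an instantiated detection model straight to the factory
model = linknet_resnet18(pretrained=True)
predictor = detection_predictor(arch=model, pretrained=True, assume_straight_pages=False)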
@@ -367,8 +449,8 @@

Source code for doctr.models.detection.zoo

       
     
   
- - + + diff --git a/v0.4.1/_modules/doctr/models/export.html b/v0.4.1/_modules/doctr/models/export.html deleted file mode 100644 index f25a81aa21..0000000000 --- a/v0.4.1/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@ - doctr.models.export - docTR documentation

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/_modules/doctr/models/factory/hub.html b/v0.4.1/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.4.1/_modules/doctr/models/factory/hub.html +++ b/v0.4.1/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.4.1/_modules/doctr/models/recognition/crnn.html b/v0.4.1/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.4.1/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ - doctr.models.recognition.crnn - docTR documentation

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs decoding of raw output with CTC and decoding of CTC predictions
-        with label_to_idx mapping dictionnary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/crnn/tensorflow.html index 41cc93dd23..bc64da9a1b 100644 --- a/v0.4.1/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.crnn.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import tensorflow as tf
 from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential, Model
-from typing import Tuple, Dict, Any, Optional, List
+from tensorflow.keras.models import Model, Sequential
+
+from doctr.datasets import VOCABS
 
-from ... import backbones
-from ...utils import load_pretrained_params
+from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
+__all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
+    "crnn_vgg16_bn": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["legacy_french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_vgg16_bn-9c188f45.weights.h5&src=0",
     },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
+    "crnn_mobilenet_v3_small": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_small-54850265.weights.h5&src=0",
+    },
+    "crnn_mobilenet_v3_large": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_large-c64045e5.weights.h5&src=0",
     },
 }
 
 
 class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
+    """Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
         ignore_case: if True, ignore case of letters
         ignore_accents: if True, ignore accents of letters
@@ -325,37 +353,57 @@ 

Source code for doctr.models.recognition.crnn.tensorflow

def __call__( self, - logits: tf.Tensor - ) -> List[Tuple[str, float]]: - """ - Performs decoding of raw output with CTC and decoding of CTC predictions + logits: tf.Tensor, + beam_width: int = 1, + top_paths: int = 1, + ) -> Union[List[Tuple[str, float]], List[Tuple[List[str], List[float]]]]: + """Performs decoding of raw output with CTC and decoding of CTC predictions with label_to_idx mapping dictionnary Args: + ---- logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1 + beam_width: An int scalar >= 0 (beam search beam width). + top_paths: An int scalar >= 0, <= beam_width (controls output size). Returns: + ------- A list of decoded words of length BATCH_SIZE + """ # Decode CTC _decoded, _log_prob = tf.nn.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), - tf.fill(logits.shape[0], logits.shape[1]), - beam_width=1, top_paths=1, + tf.fill(tf.shape(logits)[:1], tf.shape(logits)[1]), + beam_width=beam_width, + top_paths=top_paths, ) - out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab)) - probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) + + _decoded = tf.sparse.concat( + 1, + [tf.sparse.expand_dims(dec, axis=1) for dec in _decoded], + expand_nonconcat_dims=True, + ) # dim : batchsize x beamwidth x actual_max_len_predictions + out_idxs = tf.sparse.to_dense(_decoded, default_value=len(self.vocab)) # Map it to characters _decoded_strings_pred = tf.strings.reduce_join( inputs=tf.nn.embedding_lookup(tf.constant(self._embedding, dtype=tf.string), out_idxs), - axis=-1 + axis=-1, ) _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] - word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - + decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value="not valid")[ + :, :, 0 + ] # dim : batch_size x beam_width + + if top_paths == 1: + probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) # dim : batchsize + decoded_strings_pred = tf.squeeze(decoded_strings_pred, axis=1) + word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] + else: + probs = tf.math.exp(_log_prob) # dim : batchsize x beamwidth + word_values = [[word.decode() for word in words] for words in decoded_strings_pred.numpy().tolist()] return list(zip(word_values, probs.numpy().tolist())) @@ -364,19 +412,26 @@

Source code for doctr.models.recognition.crnn.tensorflow

Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of units in the LSTM layers + exportable: onnx exportable returns only logits + beam_width: beam width for beam search decoding + top_paths: number of top paths for beam search decoding cfg: configuration dictionary """ - _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "decoder", "postprocessor"] def __init__( self, - feature_extractor: tf.keras.Model, + feature_extractor: Model, vocab: str, rnn_units: int = 128, + exportable: bool = False, + beam_width: int = 1, + top_paths: int = 1, cfg: Optional[Dict[str, Any]] = None, ) -> None: # Initialize kernels @@ -386,19 +441,21 @@

Source code for doctr.models.recognition.crnn.tensorflow

self.vocab = vocab self.max_length = w self.cfg = cfg + self.exportable = exportable self.feat_extractor = feature_extractor - self.decoder = Sequential( - [ - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Dense(units=len(vocab) + 1) - ] - ) + self.decoder = Sequential([ + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Dense(units=len(vocab) + 1), + ]) self.decoder.build(input_shape=(None, w, h * c)) self.postprocessor = CTCPostProcessor(vocab=vocab) + self.beam_width = beam_width + self.top_paths = top_paths + def compute_loss( self, model_output: tf.Tensor, @@ -407,16 +464,17 @@

Source code for doctr.models.recognition.crnn.tensorflow

"""Compute CTC loss for the model. Args: - gt: the encoded tensor with gt labels + ---- model_output: predicted logits of the model - seq_len: lengths of each gt word inside the batch + target: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) batch_len = model_output.shape[0] - input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) + input_length = tf.fill((batch_len,), model_output.shape[1]) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -428,8 +486,12 @@

Source code for doctr.models.recognition.crnn.tensorflow

target: Optional[List[str]] = None, return_model_output: bool = False, return_preds: bool = False, + beam_width: int = 1, + top_paths: int = 1, **kwargs: Any, ) -> Dict[str, Any]: + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") features = self.feat_extractor(x, **kwargs) # B x H x W x C --> B x W x H x C @@ -437,91 +499,132 @@

Source code for doctr.models.recognition.crnn.tensorflow

w, h, c = transposed_feat.get_shape().as_list()[1:] # B x W x H x C --> B x W x H * C features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c)) - logits = self.decoder(features_seq, **kwargs) + logits = _bf16_to_float32(self.decoder(features_seq, **kwargs)) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = logits + return out + if return_model_output: out["out_map"] = logits if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(logits) + out["preds"] = self.postprocessor(logits, beam_width=beam_width, top_paths=top_paths) if target is not None: - out['loss'] = self.compute_loss(logits, target) + out["loss"] = self.compute_loss(logits, target) return out -def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: +def _crnn( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> CRNN: + pretrained_backbone = pretrained_backbone and not pretrained + + kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"]) - # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) + _cfg["vocab"] = kwargs["vocab"] + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] - # Feature extractor - feat_extractor = backbones.__dict__[_cfg['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + input_shape=_cfg["input_shape"], include_top=False, + pretrained=pretrained_backbone, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params(model, _cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]) return model
-[docs] +[docs] def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_vgg16_bn + >>> model = crnn_vgg16_bn(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_vgg16_bn", pretrained, vgg16_bn_r, **kwargs)
+ + + +
+[docs] +def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based + Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_small + >>> model = crnn_mobilenet_v3_small(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
+ Returns: + ------- + text recognition architecture + """ + return _crnn("crnn_mobilenet_v3_small", pretrained, mobilenet_v3_small_r, **kwargs)
-def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based +
+[docs] +def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_large + >>> model = crnn_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_mobilenet_v3_large", pretrained, mobilenet_v3_large_r, **kwargs)
- return _crnn('crnn_resnet31', pretrained, **kwargs)
@@ -554,8 +657,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

- +
+ diff --git a/v0.4.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/master/tensorflow.html index 2dc5a27717..aa6aa69325 100644 --- a/v0.4.1/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.master.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import tensorflow as tf
-from tensorflow.keras import layers, Sequential, Model
-from typing import Tuple, List, Dict, Any, Optional
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
 
-from ..core import RecognitionPostProcessor
-from ...backbones.resnet import ResnetStage
-from ...utils import conv_sequence, load_pretrained_params
-from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
-from ....datasets import VOCABS
-from .base import _MASTER, _MASTERPostProcessor
+import tensorflow as tf
+from tensorflow.keras import Model, layers
+
+from doctr.datasets import VOCABS
+from doctr.models.classification import magc_resnet31
+from doctr.models.modules.transformer import Decoder, PositionalEncoding
 
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from .base import _MASTER, _MASTERPostProcessor
 
-__all__ = ['MASTER', 'master', 'MASTERPostProcessor']
+__all__ = ["MASTER", "master"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'master': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'input_shape': (48, 160, 3),
-        'vocab': VOCABS['french'],
-        'url': None,
+    "master": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
     },
 }
 
 
-class MAGC(layers.Layer):
-
-    """Implements the Multi-Aspect Global Context Attention, as described in
-    <https://arxiv.org/pdf/1910.02562.pdf>`_.
-
-    Args:
-        inplanes: input channels
-        headers: number of headers to split channels
-        att_scale: if True, re-scale attention to counteract the variance distibutions
-        **kwargs
-    """
-
-    def __init__(
-        self,
-        inplanes: int,
-        headers: int = 1,
-        att_scale: bool = False,
-        **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-
-        self.headers = headers  # h
-        self.inplanes = inplanes  # C
-        self.att_scale = att_scale
-
-        self.single_header_inplanes = int(inplanes / headers)  # C / h
-
-        self.conv_mask = tf.keras.layers.Conv2D(
-            filters=1,
-            kernel_size=1,
-            kernel_initializer=tf.initializers.he_normal()
-        )
-
-        self.transform = tf.keras.Sequential(
-            [
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-                tf.keras.layers.LayerNormalization([1, 2, 3]),
-                tf.keras.layers.ReLU(),
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-            ],
-            name='transform'
-        )
-
-    @tf.function
-    def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
-        b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
-
-        # B, H, W, C -->> B*h, H, W, C/h
-        x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes))
-        x = tf.transpose(x, perm=(0, 3, 1, 2, 4))
-        x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes))
-
-        # Compute shorcut
-        shortcut = x
-        # B*h, 1, H*W, C/h
-        shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes))
-        # B*h, 1, C/h, H*W
-        shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2])
-
-        # Compute context mask
-        # B*h, H, W, 1,
-        context_mask = self.conv_mask(x)
-        # B*h, 1, H*W, 1
-        context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
-        # scale variance
-        if self.att_scale and self.headers > 1:
-            context_mask = context_mask / tf.sqrt(self.single_header_inplanes)
-        # B*h, 1, H*W, 1
-        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
-
-        # Compute context
-        # B*h, 1, C/h, 1
-        context = tf.matmul(shortcut, context_mask)
-        context = tf.reshape(context, shape=(b, 1, c, 1))
-        # B, 1, 1, C
-        context = tf.transpose(context, perm=(0, 1, 3, 2))
-        # Set shape to resolve shape when calling this module in the Sequential MAGCResnet
-        batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1]
-        context.set_shape([batch, 1, 1, chan])
-        return context
-
-    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
-        # Context modeling: B, H, W, C  ->  B, 1, 1, C
-        context = self.context_modeling(inputs)
-        # Transform: B, 1, 1, C  ->  B, 1, 1, C
-        transformed = self.transform(context)
-        return inputs + transformed
-
-
-class MAGCResnet(Sequential):
-
-    """Implements the modified resnet with MAGC layers, as described in paper.
-
-    Args:
-        headers: number of header to split channels in MAGC layers
-        input_shape: shape of the model input (without batch dim)
-    """
-
-    def __init__(
-        self,
-        headers: int = 1,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
-    ) -> None:
-        _layers = [
-            # conv_1x
-            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
-            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_2x
-            ResnetStage(num_blocks=1, output_channels=256),
-            MAGC(inplanes=256, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_3x
-            ResnetStage(num_blocks=2, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 1), (2, 1)),
-            # conv_4x
-            ResnetStage(num_blocks=5, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            # conv_5x
-            ResnetStage(num_blocks=3, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-        ]
-        super().__init__(_layers)
-
-
 class MASTER(_MASTER, Model):
-
     """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
     Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
 
     Args:
+    ----
+        feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary, (without EOS, SOS, PAD)
         d_model: d parameter for the transformer decoder
-        headers: headers for the MAGC module
         dff: depth of the pointwise feed-forward layer
         num_heads: number of heads for the multi-head attention module
         num_layers: number of decoder layers to stack
         max_length: maximum length of character sequence handled by the model
-        input_size: size of the image inputs
+        dropout: dropout probability of the decoder
+        input_shape: size of the image inputs
+        exportable: onnx exportable returns only logits
+        cfg: dictionary containing information about the model
     """
 
     def __init__(
         self,
+        feature_extractor: Model,
         vocab: str,
         d_model: int = 512,
-        headers: int = 1,
         dff: int = 2048,
-        num_heads: int = 8,
+        num_heads: int = 8,  # number of heads in the transformer decoder
         num_layers: int = 3,
         max_length: int = 50,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
+        dropout: float = 0.2,
+        input_shape: Tuple[int, int, int] = (32, 128, 3),  # different from the paper
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
 
-        self.vocab = vocab
+        self.exportable = exportable
         self.max_length = max_length
+        self.d_model = d_model
+        self.vocab = vocab
         self.cfg = cfg
         self.vocab_size = len(vocab)
 
-        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
-        self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS
+        self.feat_extractor = feature_extractor
+        self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])
 
         self.decoder = Decoder(
             num_layers=num_layers,
-            d_model=d_model,
+            d_model=self.d_model,
             num_heads=num_heads,
+            vocab_size=self.vocab_size + 3,  # EOS, SOS, PAD
             dff=dff,
-            vocab_size=self.vocab_size,
-            maximum_position_encoding=max_length,
+            dropout=dropout,
+            maximum_position_encoding=self.max_length,
         )
-        self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
-        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
 
+        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
 
     @tf.function
-    def make_mask(self, target: tf.Tensor) -> tf.Tensor:
-        look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
-        target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
-        combined_mask = tf.maximum(target_padding_mask, look_ahead_mask)
-        return combined_mask
+    def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
+        # (N, 1, 1, max_length)
+        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
+        target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
+        target_length = target.shape[1]
+        # sub mask filled diagonal with 1 = see 0 = masked (max_length, max_length)
+        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
+        # source mask filled with ones (max_length, positional_encoded_seq_len)
+        source_mask = tf.ones((target_length, source.shape[1]))
+        # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
+        target_mask = tf.math.logical_and(
+            tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
+        )
+        return source_mask, target_mask
 
+    @staticmethod
     def compute_loss(
-        self,
         model_output: tf.Tensor,
         gt: tf.Tensor,
         seq_len: List[int],
@@ -512,11 +413,13 @@ 

Source code for doctr.models.recognition.master.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -532,7 +435,7 @@

Source code for doctr.models.recognition.master.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) @@ -547,94 +450,103 @@

Source code for doctr.models.recognition.master.tensorflow

"""Call function for training Args: + ---- x: images target: list of str labels return_model_output: if True, return logits return_preds: if True, decode logits + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A dictionnary containing eventually loss, logits and predictions. """ - # Encode - feature = self.feature_extractor(x, **kwargs) - b, h, w, c = (tf.shape(feature)[i] for i in range(4)) + feature = self.feat_extractor(x, **kwargs) + b, h, w, c = feature.get_shape() + # (N, H, W, C) --> (N, H * W, C) feature = tf.reshape(feature, shape=(b, h * w, c)) - encoded = feature + self.feature_pe[:, :h * w, :] + # add positional encoding to features + encoded = self.positional_encoding(feature, **kwargs) out: Dict[str, tf.Tensor] = {} + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") + if target is not None: # Compute target: tensor of gts and sequence lengths - gt, seq_len = self.compute_target(target) - - if kwargs.get('training', False): - if target is None: - raise AssertionError("In training mode, you need to pass a value to 'target'") - tgt_mask = self.make_mask(gt) + gt, seq_len = self.build_target(target) + # Compute decoder masks + source_mask, target_mask = self.make_source_and_target_mask(encoded, gt) # Compute logits - output = self.decoder(gt, encoded, tgt_mask, None, **kwargs) + output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) - else: - # When not training, we want to compute logits in with the decoder, although - # we have access to gts (we need gts to compute the loss, but not in the decoder) logits = self.decode(encoded, **kwargs) + logits = _bf16_to_float32(logits) + + if self.exportable: + out["logits"] = logits + return out + if target is not None: - out['loss'] = self.compute_loss(logits, gt, seq_len) + out["loss"] = self.compute_loss(logits, gt, seq_len) if return_model_output: - out['out_map'] = logits + out["out_map"] = logits if return_preds: - predictions = self.postprocessor(logits) - out['preds'] = predictions + out["preds"] = self.postprocessor(logits) return out + @tf.function def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor: """Decode function for prediction Args: + ---- encoded: encoded features + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A Tuple of tf.Tensor: predictions, logits """ - b = tf.shape(encoded)[0] - max_len = tf.constant(self.max_length, dtype=tf.int32) + b = encoded.shape[0] + start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32) # SOS padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32) # PAD - ys = tf.fill(dims=(b, max_len - 1), value=padding_symbol) + ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol) start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols - # max_len = len + 2 (sos + eos) + # Final dimension include EOS/SOS/PAD for i in range(self.max_length - 1): - ys_mask = self.make_mask(ys) - output = self.decoder(ys, encoded, ys_mask, None, **kwargs) + source_mask, target_mask = self.make_source_and_target_mask(encoded, ys) + output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) prob = tf.nn.softmax(logits, axis=-1) - next_word = tf.argmax(prob, axis=-1, output_type=ys.dtype) - # ys.shape = B, T - 
i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(max_len), indexing='ij') + next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype) + # update ys with the next token and ignore the first token (SOS) + i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij") indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1) - ys = tf.tensor_scatter_nd_update(ys, indices, next_word[:, i + 1]) + ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i]) - # final_logits of shape (N, max_length - 1, vocab_size + 1) (whithout sos) + # Shape (N, max_length, vocab_size + 1) return logits class MASTERPostProcessor(_MASTERPostProcessor): """Post processor for MASTER architectures + Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -649,51 +561,66 @@

Source code for doctr.models.recognition.master.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER: +def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"]) + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) - kwargs['vocab'] = _cfg['vocab'] + kwargs["vocab"] = _cfg["vocab"] + kwargs["input_shape"] = _cfg["input_shape"] # Build the model - model = MASTER(cfg=_cfg, **kwargs) + model = MASTER( + backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False), + cfg=_cfg, + **kwargs, + ) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model
-[docs] +[docs] def master(pretrained: bool = False, **kwargs: Any) -> MASTER: """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import master - >>> model = master(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + + >>> import tensorflow as tf + >>> from doctr.models import master + >>> model = master(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments passed to the MASTER architecture + Returns: + ------- text recognition architecture """ - - return _master('master', pretrained, **kwargs)
+ return _master("master", pretrained, magc_resnet31, **kwargs)
@@ -727,8 +654,8 @@
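Because `_master` now loads the pretrained weights with skip_mismatch whenever the requested vocab differs from the default, a rough fine-tuning setup with a custom vocab (the digit-only vocab below is purely illustrative) could be sketched as:

from doctr.models import master

# Illustrative only: a reduced vocab; mismatching output layers are skipped when loading weights
model = master(pretrained=True, vocab="0123456789")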

Source code for doctr.models.recognition.master.tensorflow

- +
+ diff --git a/v0.4.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.4.1/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.4.1/_modules/doctr/models/recognition/sar.html b/v0.4.1/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.4.1/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - doctr.models.recognition.sar - docTR documentation

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H * W) -> (N, 1)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, 1)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + 1) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
\ No newline at end of file
diff --git a/v0.4.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/sar/tensorflow.html
index e514e4f0c4..4a591e6451 100644
--- a/v0.4.1/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/v0.4.1/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -13,7 +13,7 @@
 doctr.models.recognition.sar.tensorflow - docTR documentation
@@ -225,20 +225,42 @@

Source code for doctr.models.recognition.sar.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
+
 import tensorflow as tf
-from tensorflow.keras import Sequential, layers, Model
-from typing import Tuple, Dict, List, Any, Optional
+from tensorflow.keras import Model, Sequential, layers
 
-from ... import backbones
-from ...utils import load_pretrained_params
-from ..core import RecognitionModel, RecognitionPostProcessor
+from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
 
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
+from ...classification import resnet31
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from ..core import RecognitionModel, RecognitionPostProcessor
+
+__all__ = ["SAR", "sar_resnet31"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
+    "sar_resnet31": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/sar_resnet31-5a58806c.weights.h5&src=0",
     },
 }
 
 
+class SAREncoder(layers.Layer, NestedObject):
+    """Implements encoder module of the SAR model
+
+    Args:
+    ----
+        rnn_units: number of hidden rnn units
+        dropout_prob: dropout probability
+    """
+
+    def __init__(self, rnn_units: int, dropout_prob: float = 0.0) -> None:
+        super().__init__()
+        self.rnn = Sequential([
+            layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
+            layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
+        ])
+
+    def call(
+        self,
+        x: tf.Tensor,
+        **kwargs: Any,
+    ) -> tf.Tensor:
+        # (N, C)
+        return self.rnn(x, **kwargs)
+
+
 class AttentionModule(layers.Layer, NestedObject):
     """Implements attention module of the SAR model
 
     Args:
+    ----
         attention_units: number of hidden attention units
 
     """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
 
+    def __init__(self, attention_units: int) -> None:
         super().__init__()
         self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            3,
+            strides=1,
+            use_bias=True,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
+            1,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.flatten = layers.Flatten()
 
@@ -343,12 +395,12 @@ 

Source code for doctr.models.recognition.sar.tensorflow

hidden_state: tf.Tensor, **kwargs: Any, ) -> tf.Tensor: - [H, W] = features.get_shape().as_list()[1:3] - # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) - hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) # shape (N, H, W, vgg_units) -> (N, H, W, attention_units) features_projection = self.features_projector(features, **kwargs) + # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) + hidden_state = tf.expand_dims(tf.expand_dims(hidden_state, axis=1), axis=1) + hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) projection = tf.math.tanh(hidden_state_projection + features_projection) # shape (N, H, W, attention_units) -> (N, H, W, 1) attention = self.attention_projector(projection, **kwargs) @@ -358,23 +410,25 @@

Source code for doctr.models.recognition.sar.tensorflow

# shape (N, H * W) -> (N, H, W, 1) attention_map = tf.reshape(attention, [-1, H, W, 1]) glimpse = tf.math.multiply(features, attention_map) - # shape (N, H * W) -> (N, 1) - glimpse = tf.reduce_sum(glimpse, axis=[1, 2]) - return glimpse + # shape (N, H * W) -> (N, C) + return tf.reduce_sum(glimpse, axis=[1, 2]) class SARDecoder(layers.Layer, NestedObject): """Implements decoder module of the SAR model Args: + ---- rnn_units: number of hidden units in recurrent cells max_length: maximum length of a sequence vocab_size: number of classes in the model alphabet embedding_units: number of hidden embedding units attention_units: number of hidden attention units - num_decoder_layers: number of LSTM layers to stack + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability """ + def __init__( self, rnn_units: int, @@ -382,23 +436,22 @@

Source code for doctr.models.recognition.sar.tensorflow

vocab_size: int, embedding_units: int, attention_units: int, - num_decoder_layers: int = 2, - input_shape: Optional[List[Tuple[Optional[int]]]] = None, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, ) -> None: - super().__init__() self.vocab_size = vocab_size - self.lstm_decoder = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] - ) - self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) - self.attention_module = AttentionModule(attention_units) - self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units)) self.max_length = max_length - # Initialize kernels - if input_shape is not None: - self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units))) + self.embed = layers.Dense(embedding_units, use_bias=False) + self.embed_tgt = layers.Embedding(embedding_units, self.vocab_size + 1) + + self.lstm_cells = layers.StackedRNNCells([ + layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells) + ]) + self.attention_module = AttentionModule(attention_units) + self.output_dense = layers.Dense(self.vocab_size + 1, use_bias=True) + self.dropout = layers.Dropout(dropout_prob) def call( self, @@ -407,40 +460,47 @@

Source code for doctr.models.recognition.sar.tensorflow

gt: Optional[tf.Tensor] = None, **kwargs: Any, ) -> tf.Tensor: - - # initialize states (each of shape (N, rnn_units)) - states = self.lstm_decoder.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=tf.float32 - ) - # run first step of lstm - # holistic: shape (N, rnn_units) - _, states = self.lstm_decoder(holistic, states, **kwargs) - # Initialize with the index of virtual START symbol (placed after <eos>) - symbol = tf.fill(features.shape[0], self.vocab_size + 1) - logits_list = [] - if kwargs.get('training') and gt is None: - raise ValueError('Need to provide labels during training for teacher forcing') - for t in range(self.max_length + 1): # keep 1 step for <eos> - # one-hot symbol with depth vocab_size + 1 - # embeded_symbol: shape (N, embedding_units) - embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs) - logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs) - glimpse = self.attention_module( - features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs, - ) - # logits: shape (N, rnn_units), glimpse: shape (N, 1) - logits = tf.concat([logits, glimpse], axis=-1) - # shape (N, rnn_units + 1) -> (N, vocab_size + 1) - logits = self.output_dense(logits, **kwargs) - # update symbol with predicted logits for t+1 step - if kwargs.get('training'): - symbol = gt[:, t] # type: ignore[index] + if gt is not None: + gt_embedding = self.embed_tgt(gt, **kwargs) + + logits_list: List[tf.Tensor] = [] + + for t in range(self.max_length + 1): # 32 + if t == 0: + # step to init the first states of the LSTMCell + states = self.lstm_cells.get_initial_state( + inputs=None, batch_size=features.shape[0], dtype=features.dtype + ) + prev_symbol = holistic + elif t == 1: + # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros + # (N, vocab_size + 1) --> (N, embedding_units) + prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1], dtype=features.dtype) + prev_symbol = self.embed(prev_symbol, **kwargs) else: - symbol = tf.argmax(logits, axis=-1) - logits_list.append(logits) - outputs = tf.stack(logits_list, axis=1) # shape (N, max_length + 1, vocab_size + 1) - - return outputs + if gt is not None and kwargs.get("training", False): + # (N, embedding_units) -2 because of <bos> and <eos> (same) + prev_symbol = self.embed(gt_embedding[:, t - 2], **kwargs) + else: + # -1 to start at timestep where prev_symbol was initialized + index = tf.argmax(logits_list[t - 1], axis=-1) + # update prev_symbol with ones at the index of the previous logit vector + prev_symbol = self.embed(self.embed_tgt(index, **kwargs), **kwargs) + + # (N, C), (N, C) take the last hidden state and cell state from current timestep + _, states = self.lstm_cells(prev_symbol, states, **kwargs) + # states = (hidden_state, cell_state) + hidden_state = states[0][0] + # (N, H, W, C), (N, C) --> (N, C) + glimpse = self.attention_module(features, hidden_state, **kwargs) + # (N, C), (N, C) --> (N, 2 * C) + logits = tf.concat([hidden_state, glimpse], axis=1) + logits = self.dropout(logits, **kwargs) + # (N, vocab_size + 1) + logits_list.append(self.output_dense(logits, **kwargs)) + + # (max_length + 1, N, vocab_size + 1) --> (N, max_length + 1, vocab_size + 1) + return tf.transpose(tf.stack(logits_list[1:]), (1, 0, 2)) class SAR(Model, RecognitionModel): @@ -448,17 +508,20 @@

Source code for doctr.models.recognition.sar.tensorflow

Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of hidden units in both encoder and decoder LSTM embedding_units: number of embedding units attention_units: number of hidden units in attention module max_length: maximum word length handled by the model - num_decoders: number of LSTM to stack in decoder layer - + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability for the encoder and decoder + exportable: onnx exportable returns only logits + cfg: dictionary containing information about the model """ - _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"] def __init__( self, @@ -468,36 +531,34 @@

Source code for doctr.models.recognition.sar.tensorflow

embedding_units: int = 512, attention_units: int = 512, max_length: int = 30, - num_decoders: int = 2, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: - super().__init__() self.vocab = vocab + self.exportable = exportable self.cfg = cfg - self.max_length = max_length + 1 # Add 1 timestep for EOS after the longest word self.feat_extractor = feature_extractor - self.encoder = Sequential( - [ - layers.LSTM(units=rnn_units, return_sequences=True), - layers.LSTM(units=rnn_units, return_sequences=False) - ] - ) - # Initialize the kernels (watch out for reduce_max) - self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:]) - + self.encoder = SAREncoder(rnn_units, dropout_prob) self.decoder = SARDecoder( - rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders, - input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape] + rnn_units, + self.max_length, + len(vocab), + embedding_units, + attention_units, + num_decoder_cells, + dropout_prob, ) self.postprocessor = SARPostProcessor(vocab=vocab) + @staticmethod def compute_loss( - self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: tf.Tensor, @@ -506,11 +567,13 @@

Source code for doctr.models.recognition.sar.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -525,7 +588,7 @@

Source code for doctr.models.recognition.sar.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len) masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) def call( @@ -536,16 +599,28 @@

Source code for doctr.models.recognition.sar.tensorflow

return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - features = self.feat_extractor(x, **kwargs) - pooled_features = tf.reduce_max(features, axis=1) # vertical max pooling + # vertical max pooling --> (N, C, W) + pooled_features = tf.reduce_max(features, axis=1) + # holistic (N, C) encoded = self.encoder(pooled_features, **kwargs) + if target is not None: - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) seq_len = tf.cast(seq_len, tf.int32) - decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training for teacher forcing") + + decoded_features = _bf16_to_float32( + self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + ) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = decoded_features + return out + if return_model_output: out["out_map"] = decoded_features @@ -554,7 +629,7 @@

Source code for doctr.models.recognition.sar.tensorflow

out["preds"] = self.postprocessor(decoded_features) if target is not None: - out['loss'] = self.compute_loss(decoded_features, gt, seq_len) + out["loss"] = self.compute_loss(decoded_features, gt, seq_len) return out @@ -563,9 +638,8 @@

Source code for doctr.models.recognition.sar.tensorflow

"""Post processor for SAR architectures Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -580,95 +654,75 @@

Source code for doctr.models.recognition.sar.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: +def _sar( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> SAR: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units']) - _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units']) - _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length']) - _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) # Feature extractor - feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + pretrained=pretrained_backbone, + input_shape=_cfg["input_shape"], include_top=False, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - kwargs['embedding_units'] = _cfg['embedding_units'] - kwargs['attention_units'] = _cfg['attention_units'] - kwargs['max_length'] = _cfg['max_length'] - kwargs['num_decoders'] = _cfg['num_decoders'] + kwargs["vocab"] = _cfg["vocab"] # Build the model model = SAR(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model -
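The `compute_loss` method shown earlier in this file masks the per-timestep cross-entropy beyond the `<eos>` position so that padded timesteps do not contribute to the loss. A self-contained sketch of that masking logic on dummy tensors (illustrative only, not the library API):

import tensorflow as tf

# 2 words, 5 timesteps, 4 output classes (toy shapes)
logits = tf.random.normal((2, 5, 4))
gt = tf.constant([[1, 2, 3, 0, 0], [2, 2, 0, 0, 0]])  # encoded labels, zero-padded
seq_len = tf.constant([3, 2]) + 1  # word lengths + 1 timestep for <eos>

cce = tf.nn.softmax_cross_entropy_with_logits(tf.one_hot(gt, depth=4), logits)
mask_2d = tf.sequence_mask(seq_len, tf.shape(logits)[1])  # True up to and including <eos>
masked_loss = tf.where(mask_2d, cce, tf.zeros_like(cce))
ce_loss = tf.reduce_sum(masked_loss, axis=1) / tf.cast(seq_len, tf.float32)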
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - -
-[docs] +[docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import sar_resnet31 + >>> model = sar_resnet31(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the SAR architecture Returns: + ------- text recognition architecture """ - - return _sar('sar_resnet31', pretrained, **kwargs)
+ return _sar("sar_resnet31", pretrained, resnet31, **kwargs)
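`SARPostProcessor` above decodes logits by taking the per-step argmax, keeping the minimum softmax probability as the sequence confidence, mapping indices to characters, and truncating at the first `<eos>`. A standalone sketch of that decode path on a toy vocabulary (illustrative, not the library API):

import tensorflow as tf

vocab = "ab"  # toy vocab; index len(vocab) stands for <eos>
embedding = tf.constant(list(vocab) + ["<eos>"], dtype=tf.string)

logits = tf.random.normal((1, 4, len(vocab) + 1))  # (N, T, vocab_size + 1)
out_idxs = tf.math.argmax(logits, axis=2)
probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
probs = tf.math.reduce_min(probs, axis=1)  # sequence confidence

decoded = tf.strings.reduce_join(tf.nn.embedding_lookup(embedding, tf.cast(out_idxs, tf.int32)), axis=-1)
decoded = tf.strings.split(decoded, "<eos>")  # keep the prefix before the first <eos>
words = [w.decode() for w in tf.sparse.to_dense(decoded.to_sparse(), default_value="")[:, 0].numpy().tolist()]
print(list(zip(words, probs.numpy().clip(0, 1).tolist())))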
@@ -702,8 +756,8 @@

Source code for doctr.models.recognition.sar.tensorflow

- +
+ diff --git a/v0.4.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.4.1/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.4.1/_modules/doctr/models/recognition/zoo.html b/v0.4.1/_modules/doctr/models/recognition/zoo.html index bf0ae6af6e..f664304019 100644 --- a/v0.4.1/_modules/doctr/models/recognition/zoo.html +++ b/v0.4.1/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
-from doctr.file_utils import is_tf_available, is_torch_available
-from .core import RecognitionPredictor
-from ..preprocessor import PreProcessor
-from .. import recognition
+from doctr.file_utils import is_tf_available
+from doctr.models.preprocessor import PreProcessor
 
+from .. import recognition
+from .predictor import RecognitionPredictor
 
 __all__ = ["recognition_predictor"]
 
 
-if is_tf_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
-elif is_torch_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
+ARCHS: List[str] = [
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "sar_resnet31",
+    "master",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
+]
+
 
+def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+        _model = recognition.__dict__[arch](
+            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
+        )
+    else:
+        if not isinstance(
+            arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+        ):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+    kwargs.pop("pretrained_backbone", None)
 
-    _model = recognition.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 32)
-    predictor = RecognitionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
-        _model
-    )
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
+    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
+    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
 
     return predictor
 
 
 
-[docs] -def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor: +[docs] +def recognition_predictor( + arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + symmetric_pad: bool = False, + batch_size: int = 128, + **kwargs: Any, +) -> RecognitionPredictor: """Text recognition architecture. Example:: @@ -326,14 +369,18 @@

Source code for doctr.models.recognition.zoo

        >>> out = model([input_page])
 
     Args:
-        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
+    ----
+        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
+        **kwargs: optional parameters to be passed to the architecture
 
     Returns:
+    -------
         Recognition predictor
     """
-
-    return _predictor(arch, pretrained, **kwargs)
+ return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
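`_predictor` above accepts either an architecture name from `ARCHS` or an already-instantiated recognition model (CRNN, SAR, MASTER, ViTSTR or PARSeq). A hedged sketch of the second path, assuming `sar_resnet31` is importable from `doctr.models` as shown earlier in this diff:

from doctr.models import recognition_predictor, sar_resnet31

# Build the model explicitly (e.g. to tweak kwargs), then wrap it in a predictor
model = sar_resnet31(pretrained=True)
predictor = recognition_predictor(arch=model, batch_size=64)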
@@ -367,8 +414,8 @@

Source code for doctr.models.recognition.zoo

   
-
- +
+ diff --git a/v0.4.1/_modules/doctr/models/zoo.html b/v0.4.1/_modules/doctr/models/zoo.html index dec6857019..d459671648 100644 --- a/v0.4.1/_modules/doctr/models/zoo.html +++ b/v0.4.1/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from typing import Any
-from .core import OCRPredictor
+
 from .detection.zoo import detection_predictor
+from .kie_predictor import KIEPredictor
+from .predictor import OCRPredictor
 from .recognition.zoo import recognition_predictor
 
+__all__ = ["ocr_predictor", "kie_predictor"]
 
-__all__ = ["ocr_predictor"]
-
-
-def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
 
+def _predictor(
+    det_arch: Any,
+    reco_arch: Any,
+    pretrained: bool,
+    pretrained_backbone: bool = True,
+    assume_straight_pages: bool = True,
+    preserve_aspect_ratio: bool = True,
+    symmetric_pad: bool = True,
+    det_bs: int = 2,
+    reco_bs: int = 128,
+    detect_orientation: bool = False,
+    straighten_pages: bool = False,
+    detect_language: bool = False,
+    **kwargs,
+) -> OCRPredictor:
     # Detection
-    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
+    det_predictor = detection_predictor(
+        det_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=det_bs,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+    )
 
     # Recognition
-    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
+    reco_predictor = recognition_predictor(
+        reco_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=reco_bs,
+    )
 
-    return OCRPredictor(det_predictor, reco_predictor)
+    return OCRPredictor(
+        det_predictor,
+        reco_predictor,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+        detect_orientation=detect_orientation,
+        straighten_pages=straighten_pages,
+        detect_language=detect_language,
+        **kwargs,
+    )
 
 
 
-[docs] +[docs] def ocr_predictor( - det_arch: str = 'db_resnet50', - reco_arch: str = 'crnn_vgg16_bn', + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", pretrained: bool = False, - **kwargs: Any + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - Example:: - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` Returns: + ------- OCR predictor """ + return _predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
+ + - return _predictor(det_arch, reco_arch, pretrained, **kwargs)
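Beyond the defaults, the signature above exposes options for rotated documents. A hedged usage sketch combining a few of them (parameter names are taken from the signature, architecture names from the docstring example):

import numpy as np
from doctr.models import ocr_predictor

model = ocr_predictor(
    det_arch="db_resnet50",
    reco_arch="crnn_vgg16_bn",
    pretrained=True,
    assume_straight_pages=False,    # handle rotated text elements
    export_as_straight_boxes=True,  # but still export axis-aligned boxes
    detect_orientation=True,        # add the page orientation to the output
)
page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
out = model([page])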
+def _kie_predictor( + det_arch: Any, + reco_arch: Any, + pretrained: bool, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + det_bs: int = 2, + reco_bs: int = 128, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs, +) -> KIEPredictor: + # Detection + det_predictor = detection_predictor( + det_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=det_bs, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + ) + + # Recognition + reco_predictor = recognition_predictor( + reco_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=reco_bs, + ) + + return KIEPredictor( + det_predictor, + reco_predictor, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + ) + + +
+[docs] +def kie_predictor( + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, +) -> KIEPredictor: + """End-to-end KIE architecture using one model for localization, and another for text recognition. + + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` + + Returns: + ------- + KIE predictor + """ + return _kie_predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
@@ -353,8 +575,8 @@

Source code for doctr.models.zoo

       
     
   
- - + +
diff --git a/v0.4.1/_modules/doctr/transforms/modules.html b/v0.4.1/_modules/doctr/transforms/modules.html
deleted file mode 100644
index ba8269e7ef..0000000000
--- a/v0.4.1/_modules/doctr/transforms/modules.html
+++ /dev/null
@@ -1,734 +0,0 @@
-            doctr.transforms.modules - docTR documentation
Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/_modules/doctr/transforms/modules/base.html b/v0.4.1/_modules/doctr/transforms/modules/base.html index c42079a8fd..4596df3848 100644 --- a/v0.4.1/_modules/doctr/transforms/modules/base.html +++ b/v0.4.1/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.base

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+import math
 import random
-from typing import List, Any, Callable
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import numpy as np
 
 from doctr.utils.repr import NestedObject
+
 from .. import functional as F
 
+__all__ = ["SampleCompose", "ImageTransform", "ColorInversion", "OneOf", "RandomApply", "RandomRotate", "RandomCrop"]
+
+
+class SampleCompose(NestedObject):
+    """Implements a wrapper that will apply transformations sequentially on both image and target
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfo = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
+                >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4)))
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import torch
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfos = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
+                >>> out, out_boxes = transfos(torch.rand(8, 64, 64, 3), np.zeros((2, 4)))
+
+    Args:
+    ----
+        transforms: list of transformation modules
+    """
+
+    _children_names: List[str] = ["sample_transforms"]
+
+    def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None:
+        self.sample_transforms = transforms
+
+    def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]:
+        for t in self.sample_transforms:
+            x, target = t(x, target)
+
+        return x, target
+
+
+class ImageTransform(NestedObject):
+    """Implements a transform wrapper to turn an image-only transformation into an image+target transform
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion((32, 32)))
+                >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None)
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import torch
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion((32, 32)))
+                >>> out, _ = transfo(torch.rand(8, 64, 64, 3), None)
+
+    Args:
+    ----
+        transform: the image transformation module to wrap
+    """
+
+    _children_names: List[str] = ["img_transform"]
+
+    def __init__(self, transform: Callable[[Any], Any]) -> None:
+        self.img_transform = transform
 
-__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
+    def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]:
+        img = self.img_transform(img)
+        return img, target
 
 
 
-[docs] +[docs] class ColorInversion(NestedObject): """Applies the following tranformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(torch.rand(8, 64, 64, 3)) Args: + ---- min_val: range [min_val, 1] to colorize RGB pixels """ + def __init__(self, min_val: float = 0.5) -> None: self.min_val = min_val @@ -316,59 +437,178 @@

Source code for doctr.transforms.modules.base

-[docs] +[docs] class OneOf(NestedObject): """Randomly apply one of the input transformations - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transforms: list of transformations, one only will be picked """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: # Pick transformation transfo = self.transforms[int(random.random() * len(self.transforms))] # Apply - return transfo(img)
+ return transfo(img) if target is None else transfo(img, target) # type: ignore[call-arg]
-[docs] +[docs] class RandomApply(NestedObject): """Apply with a probability p the input transformation - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transform: transformation to apply p: probability to apply """ - def __init__(self, transform: Callable[[Any], Any], p: float = .5) -> None: + + def __init__(self, transform: Callable[[Any], Any], p: float = 0.5) -> None: self.transform = transform self.p = p def extra_repr(self) -> str: return f"transform={self.transform}, p={self.p}" - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: if random.random() < self.p: - return self.transform(img) - return img
+ return self.transform(img) if target is None else self.transform(img, target) # type: ignore[call-arg] + return img if target is None else (img, target)
+ + + +
+[docs] +class RandomRotate(NestedObject): + """Randomly rotate a tensor image and its boxes + + .. image:: https://doctr-static.mindee.com/models?id=v0.4.0/rotation_illustration.png&src=0 + :align: center + + Args: + ---- + max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in + [-max_angle, max_angle] + expand: whether the image should be padded before the rotation + """ + + def __init__(self, max_angle: float = 5.0, expand: bool = False) -> None: + self.max_angle = max_angle + self.expand = expand + + def extra_repr(self) -> str: + return f"max_angle={self.max_angle}, expand={self.expand}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + angle = random.uniform(-self.max_angle, self.max_angle) + r_img, r_polys = F.rotate_sample(img, target, angle, self.expand) + # Removes deleted boxes + is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2 + return r_img, r_polys[is_kept]
+ + + +
+[docs] +class RandomCrop(NestedObject): + """Randomly crop a tensor image and its boxes + + Args: + ---- + scale: tuple of floats, relative (min_area, max_area) of the crop + ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w + """ + + def __init__(self, scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: + self.scale = scale + self.ratio = ratio + + def extra_repr(self) -> str: + return f"scale={self.scale}, ratio={self.ratio}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + scale = random.uniform(self.scale[0], self.scale[1]) + ratio = random.uniform(self.ratio[0], self.ratio[1]) + + height, width = img.shape[:2] + + # Calculate crop size + crop_area = scale * width * height + aspect_ratio = ratio * (width / height) + crop_width = int(round(math.sqrt(crop_area * aspect_ratio))) + crop_height = int(round(math.sqrt(crop_area / aspect_ratio))) + + # Ensure crop size does not exceed image dimensions + crop_width = min(crop_width, width) + crop_height = min(crop_height, height) + + # Randomly select crop position + x = random.randint(0, width - crop_width) + y = random.randint(0, height - crop_height) + + # relative crop box + crop_box = (x / width, y / height, (x + crop_width) / width, (y + crop_height) / height) + if target.shape[1:] == (4, 2): + min_xy = np.min(target, axis=1) + max_xy = np.max(target, axis=1) + _target = np.concatenate((min_xy, max_xy), axis=1) + else: + _target = target + + # Crop image and targets + croped_img, crop_boxes = F.crop_detection(img, _target, crop_box) + # hard fallback if no box is kept + if crop_boxes.shape[0] == 0: + return img, target + # clip boxes + return croped_img, np.clip(crop_boxes, 0, 1)
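Neither RandomRotate nor RandomCrop carries a usage snippet in its docstring; as a minimal sketch (TensorFlow backend assumed, relative (N, 4, 2) polygons, illustrative values), they can be chained with the new SampleCompose wrapper:

>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomCrop, RandomRotate, SampleCompose
>>> # one relative polygon roughly covering the centre of the image
>>> polys = np.array([[[0.3, 0.3], [0.7, 0.3], [0.7, 0.7], [0.3, 0.7]]], dtype=np.float32)
>>> transfo = SampleCompose([RandomRotate(max_angle=10., expand=True), RandomCrop(scale=(0.5, 1.0))])
>>> out_img, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), polys)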
@@ -402,8 +642,8 @@

Source code for doctr.transforms.modules.base

- - + + diff --git a/v0.4.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.4.1/_modules/doctr/transforms/modules/tensorflow.html index 1d192a876b..acbbe96225 100644 --- a/v0.4.1/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.4.1/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
 import tensorflow as tf
-from typing import List, Any, Tuple, Callable
 
 from doctr.utils.repr import NestedObject
 
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
-           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
+from ..functional.tensorflow import _gaussian_filter, random_shadow
+
+__all__ = [
+    "Compose",
+    "Resize",
+    "Normalize",
+    "LambdaTransformation",
+    "ToGray",
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomSaturation",
+    "RandomHue",
+    "RandomGamma",
+    "RandomJpegQuality",
+    "GaussianBlur",
+    "ChannelShuffle",
+    "GaussianNoise",
+    "RandomHorizontalFlip",
+    "RandomShadow",
+    "RandomResize",
+]
 
 
 
-[docs] +[docs] class Compose(NestedObject): """Implements a wrapper that will apply transformations sequentially - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Compose, Resize + >>> transfos = Compose([Resize((32, 32))]) + >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- transforms: list of transformation modules """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms @@ -319,26 +361,27 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class Resize(NestedObject): """Resizes a tensor to a target size - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Resize + >>> transfo = Resize((32, 32)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- output_size: expected output size method: interpolation method preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically """ + def __init__( self, - output_size: Tuple[int, int], - method: str = 'bilinear', + output_size: Union[int, Tuple[int, int]], + method: str = "bilinear", preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, ) -> None: @@ -346,6 +389,14 @@

Source code for doctr.transforms.modules.tensorflow

self.method = method self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad + self.antialias = True + + if isinstance(self.output_size, int): + self.wanted_size = (self.output_size, self.output_size) + elif isinstance(self.output_size, (tuple, list)): + self.wanted_size = self.output_size + else: + raise AssertionError("Output size should be either a list, a tuple or an int") def extra_repr(self) -> str: _repr = f"output_size={self.output_size}, method='{self.method}'" @@ -353,64 +404,106 @@

Source code for doctr.transforms.modules.tensorflow

_repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" return _repr - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) + def __call__( + self, + img: tf.Tensor, + target: Optional[np.ndarray] = None, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: + input_dtype = img.dtype + self.output_size = ( + (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size + ) + + img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias) + # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio + raw_shape = img.shape[:2] + if self.symmetric_pad: + half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0) if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
+ if isinstance(self.output_size, (tuple, list)): + # In that case we need to pad because we want to enforce both width and height + if not self.symmetric_pad: + half_pad = (0, 0) + elif self.output_size[0] == img.shape[0]: + half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2)) + # Pad image + img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size) + + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: + if self.symmetric_pad: + offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1] + + if self.preserve_aspect_ratio: + # Get absolute coords + if target.shape[1:] == (4,): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1] + target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0] + else: + target[:, [0, 2]] *= raw_shape[1] / img.shape[1] + target[:, [1, 3]] *= raw_shape[0] / img.shape[0] + elif target.shape[1:] == (4, 2): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1] + target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0] + else: + target[..., 0] *= raw_shape[1] / img.shape[1] + target[..., 1] *= raw_shape[0] / img.shape[0] + else: + raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") + + return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1) + + return tf.cast(img, dtype=input_dtype)
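To make the aspect-ratio branch concrete, here is a small sketch (illustrative values): a 32x64 input resized to (64, 64) with preserve_aspect_ratio and symmetric_pad keeps its ratio, gets padded vertically, and the relative boxes are shifted accordingly:

>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import Resize
>>> transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=True)
>>> boxes = np.array([[0.1, 0.1, 0.9, 0.9]], dtype=np.float32)  # relative (xmin, ymin, xmax, ymax)
>>> out_img, out_boxes = transfo(tf.random.uniform(shape=[32, 64, 3], minval=0, maxval=1), boxes)
>>> out_img.shape  # TensorShape([64, 64, 3])
>>> out_boxes      # approx. [[0.1, 0.3, 0.9, 0.7]]: y-coords are rescaled and re-centred by the padding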
-[docs] +[docs] class Normalize(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Normalize + >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- mean: average value per channel std: standard deviation per channel """ + def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) + self.mean = tf.constant(mean) + self.std = tf.constant(std) def extra_repr(self) -> str: return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std + img -= tf.cast(self.mean, dtype=img.dtype) + img /= tf.cast(self.std, dtype=img.dtype) return img
-[docs] +[docs] class LambdaTransformation(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import LambdaTransformation + >>> transfo = LambdaTransformation(lambda x: x/ 255.) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- fn: the function to be applied to the input tensor """ + def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: self.fn = fn @@ -420,37 +513,42 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class ToGray(NestedObject): """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import ToGray + >>> transfo = ToGray() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) """ + + def __init__(self, num_output_channels: int = 1): + self.num_output_channels = num_output_channels + def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
+ img = tf.image.rgb_to_grayscale(img) + return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
-[docs] +[docs] class RandomBrightness(NestedObject): """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomBrightness + >>> transfo = RandomBrightness() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] p: probability to apply transformation """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -463,21 +561,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomContrast(NestedObject): """Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomContrast + >>> transfo = RandomContrast() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) """ - def __init__(self, delta: float = .3) -> None: + + def __init__(self, delta: float = 0.3) -> None: self.delta = delta def extra_repr(self) -> str: @@ -489,21 +588,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomSaturation(NestedObject): """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomSaturation + >>> transfo = RandomSaturation() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) """ - def __init__(self, delta: float = .5) -> None: + + def __init__(self, delta: float = 0.5) -> None: self.delta = delta def extra_repr(self) -> str: @@ -515,19 +615,20 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomHue(NestedObject): """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHue + >>> transfo = RandomHue() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -540,22 +641,23 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomGamma(NestedObject): """randomly performs gamma correction for a tensor (batch of images or image) - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomGamma + >>> transfo = RandomGamma() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- min_gamma: non-negative real number, lower bound for gamma param max_gamma: non-negative real number, upper bound for gamma min_gain: lower bound for constant multiplier max_gain: upper bound for constant multiplier """ + def __init__( self, min_gamma: float = 0.5, @@ -580,20 +682,21 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomJpegQuality(NestedObject): """Randomly adjust jpeg quality of a 3 dimensional RGB image - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomJpegQuality + >>> transfo = RandomJpegQuality() + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- min_quality: int between [0, 100] max_quality: int between [0, 100] """ + def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: self.min_quality = min_quality self.max_quality = max_quality @@ -602,10 +705,224 @@

Source code for doctr.transforms.modules.tensorflow

return f"min_quality={self.min_quality}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality + return tf.image.random_jpeg_quality(img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality)
+ + + +
+[docs]
+class GaussianBlur(NestedObject):
+    """Randomly blur a 3 dimensional RGB image with a Gaussian filter
+
+    >>> import tensorflow as tf
+    >>> from doctr.transforms import GaussianBlur
+    >>> transfo = GaussianBlur(3, (.1, 5))
+    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
+
+    Args:
+    ----
+        kernel_shape: size of the blurring kernel
+        std: min and max value of the standard deviation
+    """
+
+    def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None:
+        self.kernel_shape = kernel_shape
+        self.std = std
+
+    def extra_repr(self) -> str:
+        return f"kernel_shape={self.kernel_shape}, std={self.std}"
+
+    def __call__(self, img: tf.Tensor) -> tf.Tensor:
+        return tf.squeeze(
+            _gaussian_filter(
+                img[tf.newaxis, ...],
+                kernel_size=self.kernel_shape,
+                sigma=random.uniform(self.std[0], self.std[1]),
+                mode="REFLECT",
+            ),
+            axis=0, )
+ + +
+[docs] +class ChannelShuffle(NestedObject): + """Randomly shuffle channel order of a given image""" + + def __init__(self): + pass + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
+ + + +
+[docs] +class GaussianNoise(NestedObject): + """Adds Gaussian Noise to the input tensor + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianNoise + >>> transfo = GaussianNoise(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + mean : mean of the gaussian distribution + std : std of the gaussian distribution + """ + + def __init__(self, mean: float = 0.0, std: float = 1.0) -> None: + super().__init__() + self.std = std + self.mean = mean + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), dtype=tf.uint8 + ) + else: + return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype) + + def extra_repr(self) -> str: + return f"mean={self.mean}, std={self.std}"
+ + + +
+[docs] +class RandomHorizontalFlip(NestedObject): + """Adds random horizontal flip to the input tensor/np.ndarray + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHorizontalFlip + >>> transfo = RandomHorizontalFlip(p=0.5) + >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1) + >>> target = np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32) + >>> out = transfo(image, target) + + Args: + ---- + p : probability of Horizontal Flip + """ + + def __init__(self, p: float) -> None: + super().__init__() + self.p = p + + def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + _img = tf.image.flip_left_right(img) + _target = target.copy() + # Changing the relative bbox coordinates + if target.shape[1:] == (4,): + _target[:, ::2] = 1 - target[:, [2, 0]] + else: + _target[..., 0] = 1 - target[..., 0] + return _img, _target + return img, target
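As a quick numeric check of the box update (pure NumPy, illustrative values), flipping a straight (N, 4) box mirrors and swaps its x-coordinates:

>>> import numpy as np
>>> target = np.array([[0.1, 0.2, 0.4, 0.5]], dtype=np.float32)  # (xmin, ymin, xmax, ymax)
>>> flipped = target.copy()
>>> flipped[:, ::2] = 1 - target[:, [2, 0]]  # same update as in __call__
>>> flipped  # array([[0.6, 0.2, 0.9, 0.5]], dtype=float32)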
+ + + +
+[docs] +class RandomShadow(NestedObject): + """Adds random shade to the input image + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomShadow + >>> transfo = RandomShadow(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + opacity_range : minimum and maximum opacity of the shade + """ + + def __init__(self, opacity_range: Optional[Tuple[float, float]] = None) -> None: + super().__init__() + self.opacity_range = opacity_range if isinstance(opacity_range, tuple) else (0.2, 0.8) + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value( + tf.math.round(255 * random_shadow(tf.cast(x, dtype=tf.float32) / 255, self.opacity_range)), + 0, + 255, + ), + dtype=tf.uint8, + ) + else: + return tf.clip_by_value(random_shadow(x, self.opacity_range), 0, 1) + + def extra_repr(self) -> str: + return f"opacity_range={self.opacity_range}"
+ + + +
+[docs] +class RandomResize(NestedObject): + """Randomly resize the input image and align corresponding targets + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomResize + >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + scale_range: range of the resizing factor for width and height (independently) + preserve_aspect_ratio: whether to preserve the aspect ratio of the image, + given a float value, the aspect ratio will be preserved with this probability + symmetric_pad: whether to symmetrically pad the image, + given a float value, the symmetric padding will be applied with this probability + p: probability to apply the transformation + """ + + def __init__( + self, + scale_range: Tuple[float, float] = (0.3, 0.9), + preserve_aspect_ratio: Union[bool, float] = False, + symmetric_pad: Union[bool, float] = False, + p: float = 0.5, + ): + super().__init__() + self.scale_range = scale_range + self.preserve_aspect_ratio = preserve_aspect_ratio + self.symmetric_pad = symmetric_pad + self.p = p + self._resize = Resize + + def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + scale_h = random.uniform(*self.scale_range) + scale_w = random.uniform(*self.scale_range) + new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w)) + + _img, _target = self._resize( + new_size, + preserve_aspect_ratio=self.preserve_aspect_ratio + if isinstance(self.preserve_aspect_ratio, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + symmetric_pad=self.symmetric_pad + if isinstance(self.symmetric_pad, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + )(img, target) + + return _img, _target + return img, target + + def extra_repr(self) -> str: + return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}" # noqa: E501
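Since RandomResize rescales the image and forwards its boxes through Resize, it is called with both the image and its target; a minimal sketch (relative (N, 4) boxes, illustrative values):

>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomResize
>>> transfo = RandomResize(scale_range=(0.3, 0.9), p=1.0)  # p=1 so the resize is always applied
>>> boxes = np.array([[0.2, 0.2, 0.8, 0.8]], dtype=np.float32)
>>> out_img, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), boxes)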
+
@@ -638,8 +955,8 @@

Source code for doctr.transforms.modules.tensorflow

- +
+ diff --git a/v0.4.1/_modules/doctr/utils/metrics.html b/v0.4.1/_modules/doctr/utils/metrics.html index 460c64a385..8a37d5949a 100644 --- a/v0.4.1/_modules/doctr/utils/metrics.html +++ b/v0.4.1/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.metrics

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-import cv2
-from typing import List, Tuple, Dict, Optional
-from unidecode import unidecode
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
-from doctr.utils.geometry import rbbox_to_polygon
+from shapely.geometry import Polygon
 
-__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
-           'nms', 'LocalizationConfusion', 'OCRMetric']
+__all__ = [
+    "TextMatch",
+    "box_iou",
+    "polygon_iou",
+    "nms",
+    "LocalizationConfusion",
+    "OCRMetric",
+    "DetectionMetric",
+]
 
 
 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
-    """Perform string comparison with multiple levels of tolerance
+    """Performs string comparison with multiple levels of tolerance
 
     Args:
+    ----
         word1: a string
         word2: another string
 
     Returns:
+    -------
         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-            unidecode counterparts and their lower-case unidecode counterparts match
+            anyascii counterparts and their lower-case anyascii counterparts match
     """
-    raw_match = (word1 == word2)
-    caseless_match = (word1.lower() == word2.lower())
-    unidecode_match = (unidecode(word1) == unidecode(word2))
+    raw_match = word1 == word2
+    caseless_match = word1.lower() == word2.lower()
+    anyascii_match = anyascii(word1) == anyascii(word2)
 
     # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
-    unicase_match = (unidecode(word1).lower() == unidecode(word2).lower())
+    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-    return raw_match, caseless_match, unidecode_match, unicase_match
+    return raw_match, caseless_match, anyascii_match, unicase_match
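As an illustration (doctest-style sketch, assuming anyascii maps "é" to "e"), the four flags relax the comparison step by step:

>>> from doctr.utils.metrics import string_match
>>> string_match("Hello", "Hello")   # exact match
(True, True, True, True)
>>> string_match("Hello", "hello")   # only case differs
(False, True, False, True)
>>> string_match("café", "CAFE")     # differs in case and accent
(False, False, False, True)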
 
 
 
-[docs] +[docs] class TextMatch: - """Implements text match metric (word-level accuracy) for recognition task. + r"""Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \\forall X, Y \\in \\mathcal{W}^N, - TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) + \forall X, Y \in \mathcal{W}^N, + TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \\forall a, x \\in \\mathcal{W}, - f_a(x) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } x = a \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{W}` is the set of all possible character sequences, + \forall a, x \in \mathcal{W}, + f_a(x) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } x = a \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. - Example:: - >>> from doctr.utils import TextMatch - >>> metric = TextMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() """ def __init__(self) -> None: self.reset() +
+[docs] def update( self, gt: List[str], @@ -354,29 +386,32 @@

Source code for doctr.utils.metrics

         """Update the state of the metric with new predictions
 
         Args:
+        ----
             gt: list of ground-truth character sequences
-            pred: list of predicted character sequences"""
-
+            pred: list of predicted character sequences
+        """
         if len(gt) != len(pred):
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.unidecode += int(_unidecode)
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
-        self.total += len(gt)
+        self.total += len(gt)
+
-[docs] +[docs] def summary(self) -> Dict[str, float]: """Computes the aggregated metrics - Returns: - a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode - counterpart and its lower-case unidecode counterpart + Returns + ------- + a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii + counterpart and its lower-case anyascii counterpart """ if self.total == 0: raise AssertionError("you need to update the metric before getting the summary") @@ -384,7 +419,7 @@

Source code for doctr.utils.metrics

         return dict(
             raw=self.raw / self.total,
             caseless=self.caseless / self.total,
-            unidecode=self.unidecode / self.total,
+            anyascii=self.anyascii / self.total,
             unicase=self.unicase / self.total,
         )
@@ -392,23 +427,25 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.raw = 0
         self.caseless = 0
-        self.unidecode = 0
+        self.anyascii = 0
         self.unicase = 0
         self.total = 0
def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Compute the IoU between two sets of bounding boxes + """Computes the IoU between two sets of bounding boxes Args: + ---- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) + Returns: + ------- the IoU matrix of shape (N, M) """ - - iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) + iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) @@ -419,107 +456,54 @@

Source code for doctr.utils.metrics

         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
 
 
-def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
-    """Compute the IoA (intersection over area) between two sets of bounding boxes:
-    ioa(i, j) = inter(i, j) / area(i)
-
-    Args:
-        boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
-        boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
-    Returns:
-        the IoA matrix of shape (N, M)
-    """
-
-    ioa_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
-
-    if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
-        l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
-        l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
-
-        left = np.maximum(l1, l2.T)
-        top = np.maximum(t1, t2.T)
-        right = np.minimum(r1, r2.T)
-        bot = np.minimum(b1, b2.T)
-
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
-        area = (r1 - l1) * (b1 - t1)
-        ioa_mat = intersection / area
-
-    return ioa_mat
-
-
-def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
-    """Compute the IoU between two sets of boolean masks
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
+    """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
-        masks_1: boolean masks of shape (N, H, W)
-        masks_2: boolean masks of shape (M, H, W)
+    ----
+        polys_1: rotated bounding boxes of shape (N, 4, 2)
+        polys_2: rotated bounding boxes of shape (M, 4, 2)
 
     Returns:
+    -------
         the IoU matrix of shape (N, M)
     """
+    if polys_1.ndim != 3 or polys_2.ndim != 3:
+        raise AssertionError("expects boxes to be in format (N, 4, 2)")
 
-    if masks_1.shape[1:] != masks_2.shape[1:]:
-        raise AssertionError("both boolean masks should have the same spatial shape")
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
 
-    iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
 
-    if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
-        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
-        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
-        axes = tuple(range(2, masks_1.ndim + 1))
-        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
 
     return iou_mat
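A small worked example of the pairwise computation (illustrative coordinates): a unit square against its left half gives an intersection of 0.5 over a union of 1.0:

>>> import numpy as np
>>> from doctr.utils.metrics import polygon_iou
>>> squares = np.array([[[0, 0], [1, 0], [1, 1], [0, 1]]], dtype=np.float32)     # (1, 4, 2)
>>> halves = np.array([[[0, 0], [0.5, 0], [0.5, 1], [0, 1]]], dtype=np.float32)  # (1, 4, 2)
>>> polygon_iou(squares, halves)
array([[0.5]], dtype=float32)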
 
 
-def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Convert boxes to masks
-
-    Args:
-        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
-        shape: spatial shapes of the output masks
-
-    Returns:
-        the boolean masks of shape (N, H, W)
-    """
-
-    masks = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
-
-    if boxes.shape[0] > 0:
-        # Get absolute coordinates
-        if boxes.dtype != np.int:
-            abs_boxes = boxes.copy()
-            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
-            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
-            abs_boxes = abs_boxes.round().astype(np.int)
-        else:
-            abs_boxes = boxes
-            abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
-
-        # TODO: optimize slicing to improve vectorization
-        for idx, _box in enumerate(abs_boxes):
-            box = rbbox_to_polygon(_box)
-            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
-
-    return masks.astype(bool)
-
-
-def nms(boxes: np.ndarray, thresh: float = .5) -> List[int]:
+def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
     """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
 
     Args:
+    ----
         boxes: np array of straight boxes: (*, 5), (xmin, ymin, xmax, ymax, score)
         thresh: iou threshold to perform box suppression.
 
     Returns:
+    -------
         A list of box indexes to keep
     """
     x1 = boxes[:, 0]
@@ -551,66 +535,71 @@ 

Source code for doctr.utils.metrics

 
 
 
-[docs] +[docs] class LocalizationConfusion: - """Implements common confusion metrics and mean IoU for localization evaluation. + r"""Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ - Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) + \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ + Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ + Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ + meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \\forall y \\in \\mathcal{B}, - g_X(y) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, + \forall y \in \mathcal{B}, + g_X(y) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: + """Updates the metric + Args: + ---- + gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + """ if preds.shape[0] > 0: # Compute IoU - if self.rotated_bbox: - mask_gts = rbox_to_mask(gts, shape=self.mask_shape) - mask_preds = rbox_to_mask(preds, shape=self.mask_shape) - iou_mat = mask_iou(mask_gts, mask_preds) + if self.use_polygons: + iou_mat = polygon_iou(gts, preds) else: iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=1).sum()) + self.tot_iou += float(iou_mat.max(axis=0).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -618,17 +607,18 @@

Source code for doctr.utils.metrics

 
         # Update counts
         self.num_gts += gts.shape[0]
-        self.num_preds += preds.shape[0]
+        self.num_preds += preds.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: """Computes the aggregated metrics - Returns: + Returns + ------- a tuple with the recall, precision and meanIoU scores """ - # Recall recall = self.matches / self.num_gts if self.num_gts > 0 else None @@ -636,7 +626,7 @@

Source code for doctr.utils.metrics

         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -645,64 +635,65 @@

Source code for doctr.utils.metrics

         self.num_gts = 0
         self.num_preds = 0
         self.matches = 0
-        self.tot_iou = 0.
+ self.tot_iou = 0.0
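With use_polygons=True the metric expects relative (N, 4, 2) polygons instead of straight (N, 4) boxes; a minimal sketch (illustrative values, a single perfectly matched polygon):

>>> import numpy as np
>>> from doctr.utils import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5, use_polygons=True)
>>> poly = np.array([[[0.1, 0.1], [0.4, 0.1], [0.4, 0.4], [0.1, 0.4]]], dtype=np.float32)
>>> metric.update(poly, poly.copy())
>>> metric.summary()  # recall, precision and mean IoU are all 1.0 here
(1.0, 1.0, 1.0)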
-[docs] +[docs] class OCRMetric: - """Implements end-to-end OCR metric. + r"""Implements an end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, - \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ - Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) + \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, + \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ + Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, - h_{B,L}(b, l) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ - & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, - :math:`\\mathcal{L}` is the set of possible character sequences, + \forall (b, l) \in \mathcal{B} \times \mathcal{L}, + h_{B,L}(b, l) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - ['hello'], ['hello', 'world']) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> ['hello'], ['hello', 'world']) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update( self, gt_boxes: np.ndarray, @@ -710,50 +701,58 @@

Source code for doctr.utils.metrics

         gt_labels: List[str],
         pred_labels: List[str],
     ) -> None:
+        """Updates the metric
 
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+            gt_labels: a list of N string labels
+            pred_labels: a list of M string labels
+        """
         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
-            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
-                                 "and the predictions")
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
 
         # Compute IoU
         if pred_boxes.shape[0] > 0:
-            if self.rotated_bbox:
-                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
-                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
-                iou_mat = mask_iou(mask_gts, mask_preds)
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
-            self.tot_iou += float(iou_mat.max(axis=1).sum())
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
 
             # Assign pairs
             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.unidecode_matches += int(_unidecode)
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
 
         self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]: """Computes the aggregated metrics - Returns: - a tuple with the recall & precision for each string comparison flexibility and the mean IoU + Returns + ------- + a tuple with the recall & precision for each string comparison and the mean IoU """ - # Recall recall = dict( raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None, caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None, - unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None, + anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None, unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None, ) @@ -761,12 +760,12 @@

Source code for doctr.utils.metrics

         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-            unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
             unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
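
A rough usage sketch of the OCRMetric API shown in this hunk (the boxes and labels below are illustrative, not taken from the library's test suite): each update() call accumulates box matches plus the four string-comparison counters (raw, caseless, anyascii, unicase), and summary() turns them into recall/precision dictionaries and a rounded mean IoU.

>>> import numpy as np
>>> from doctr.utils import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> # one GT word, two predictions: the first overlaps it (IoU >= 0.5) with the right transcription
>>> gt_boxes = np.array([[0.0, 0.0, 0.5, 0.5]])
>>> pred_boxes = np.array([[0.0, 0.0, 0.45, 0.5], [0.6, 0.6, 0.8, 0.8]])
>>> metric.update(gt_boxes, pred_boxes, ["hello"], ["hello", "world"])
>>> recall, precision, mean_iou = metric.summary()
>>> # recall["raw"] -> 1.0 (1 matched GT out of 1), precision["raw"] -> 0.5 (1 out of 2 preds), mean_iou -> 0.45
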
@@ -774,12 +773,136 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.num_gts = 0
         self.num_preds = 0
-        self.tot_iou = 0.
+        self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.unidecode_matches = 0
+        self.anyascii_matches = 0
         self.unicase_matches = 0
+ + +
+[docs] +class DetectionMetric: + r"""Implements an object detection metric. + + The aggregated metrics are computed as follows: + + .. math:: + \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, + \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ + Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + + with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and + :math:`y`, and the function :math:`h_{B, C}` defined as: + + .. math:: + \forall (b, c) \in \mathcal{B} \times \mathcal{C}, + h_{B,C}(b, c) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{C}` is the set of possible class indices, + :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. + + >>> import numpy as np + >>> from doctr.utils import DetectionMetric + >>> metric = DetectionMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) + >>> metric.summary() + + Args: + ---- + iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format + """ + + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: + self.iou_thresh = iou_thresh + self.use_polygons = use_polygons + self.reset() + +
+[docs] + def update( + self, + gt_boxes: np.ndarray, + pred_boxes: np.ndarray, + gt_labels: np.ndarray, + pred_labels: np.ndarray, + ) -> None: + """Updates the metric + + Args: + ---- + gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + gt_labels: an array of class indices of shape (N,) + pred_labels: an array of class indices of shape (M,) + """ + if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: + raise AssertionError( + "there should be the same number of boxes and string both for the ground truth and the predictions" + ) + + # Compute IoU + if pred_boxes.shape[0] > 0: + if self.use_polygons: + iou_mat = polygon_iou(gt_boxes, pred_boxes) + else: + iou_mat = box_iou(gt_boxes, pred_boxes) + + self.tot_iou += float(iou_mat.max(axis=0).sum()) + + # Assign pairs + gt_indices, pred_indices = linear_sum_assignment(-iou_mat) + is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh + # Category comparison + self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) + + self.num_gts += gt_boxes.shape[0] + self.num_preds += pred_boxes.shape[0]
+ + +
+[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Computes the aggregated metrics + + Returns + ------- + a tuple with the recall & precision for each class prediction and the mean IoU + """ + # Recall + recall = self.num_matches / self.num_gts if self.num_gts > 0 else None + + # Precision + precision = self.num_matches / self.num_preds if self.num_preds > 0 else None + + # mean IoU (overall detected boxes) + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None + + return recall, precision, mean_iou
+ + + def reset(self) -> None: + self.num_gts = 0 + self.num_preds = 0 + self.tot_iou = 0.0 + self.num_matches = 0
+
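
A similar sketch for the DetectionMetric class added above, with integer class indices in place of transcriptions (again, the numbers are illustrative assumptions):

>>> import numpy as np
>>> from doctr.utils import DetectionMetric
>>> metric = DetectionMetric(iou_thresh=0.5)
>>> gt_boxes = np.array([[0.0, 0.0, 0.5, 0.5]])
>>> pred_boxes = np.array([[0.0, 0.0, 0.45, 0.5], [0.6, 0.6, 0.8, 0.8]])
>>> metric.update(gt_boxes, pred_boxes, np.array([0]), np.array([0, 1]))
>>> recall, precision, mean_iou = metric.summary()
>>> # recall -> 1.0 (box and class both match for the GT), precision -> 0.5, mean_iou -> 0.45
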
@@ -812,8 +935,8 @@

Source code for doctr.utils.metrics

       
     
   
- - + + diff --git a/v0.4.1/_modules/doctr/utils/visualization.html b/v0.4.1/_modules/doctr/utils/visualization.html index 8e7dcca811..c818be6d7b 100644 --- a/v0.4.1/_modules/doctr/utils/visualization.html +++ b/v0.4.1/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.visualization

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import colorsys
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import matplotlib.pyplot as plt
-from matplotlib.figure import Figure
+import cv2
 import matplotlib.patches as patches
-import mplcursors
-from PIL import ImageFont, ImageDraw, Image
+import matplotlib.pyplot as plt
 import numpy as np
-import cv2
-from typing import Tuple, List, Dict, Any, Union
+from matplotlib.figure import Figure
 
-from .common_types import BoundingBox, RotatedBbox
+from .common_types import BoundingBox, Polygon4P
 
-__all__ = ['visualize_page', 'synthetize_page']
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
-def create_rect_patch(
-    geometry: Union[BoundingBox, RotatedBbox],
-    label: str,
+def rect_patch(
+    geometry: BoundingBox,
     page_dimensions: Tuple[int, int],
-    color: Tuple[int, int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
     alpha: float = 0.3,
     linewidth: int = 2,
     fill: bool = True,
-) -> patches.Patch:
-    """Create a matplotlib patch (rectangle) bounding the element
+    preserve_aspect_ratio: bool = False,
+) -> patches.Rectangle:
+    """Create a matplotlib rectangular patch for the element
 
     Args:
+    ----
         geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
         label: label to display when hovered
-        page_dimensions: dimensions of the Page
         color: color to draw box
         alpha: opacity parameter to fill the boxes, 0 = transparent
         linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
 
     Returns:
+    -------
         a rectangular Patch
     """
+    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
     height, width = page_dimensions
-    if len(geometry) == 5:
-        x, y, w, h, a = geometry  # type: ignore[misc]
-        x, w = x * width, w * width
-        y, h = y * height, h * height
-        points = cv2.boxPoints(((x, y), (w, h), a))
-        return patches.Polygon(
-            points,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
-    else:
-        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
-        xmin, xmax = xmin * width, xmax * width
-        ymin, ymax = ymin * height, ymax * height
-        return patches.Rectangle(
-            (xmin, ymin),
-            xmax - xmin,
-            ymax - ymin,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
+    (xmin, ymin), (xmax, ymax) = geometry
+    # Switch to absolute coords
+    if preserve_aspect_ratio:
+        width = height = max(height, width)
+    xmin, w = xmin * width, (xmax - xmin) * width
+    ymin, h = ymin * height, (ymax - ymin) * height
+
+    return patches.Rectangle(
+        (xmin, ymin),
+        w,
+        h,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def polygon_patch(
+    geometry: np.ndarray,
+    page_dimensions: Tuple[int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    alpha: float = 0.3,
+    linewidth: int = 2,
+    fill: bool = True,
+    preserve_aspect_ratio: bool = False,
+) -> patches.Polygon:
+    """Create a matplotlib polygon patch for the element
+
+    Args:
+    ----
+        geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
+        label: label to display when hovered
+        color: color to draw box
+        alpha: opacity parameter to fill the boxes, 0 = transparent
+        linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
+
+    Returns:
+    -------
+        a polygon Patch
+    """
+    if not geometry.shape == (4, 2):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
+    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
+    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
+
+    return patches.Polygon(
+        geometry,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def create_obj_patch(
+    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
+    page_dimensions: Tuple[int, int],
+    **kwargs: Any,
+) -> patches.Patch:
+    """Create a matplotlib patch for the element
+
+    Args:
+    ----
+        geometry: bounding box (straight or rotated) of the element
+        page_dimensions: dimensions of the page in format (height, width)
+        **kwargs: keyword arguments for the patch
+
+    Returns:
+    -------
+        a matplotlib Patch
+    """
+    if isinstance(geometry, tuple):
+        if len(geometry) == 2:  # straight word BB (2 pts)
+            return rect_patch(geometry, page_dimensions, **kwargs)
+        elif len(geometry) == 4:  # rotated word BB (4 pts)
+            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
+    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
+        return polygon_patch(geometry, page_dimensions, **kwargs)
+    raise ValueError("invalid geometry format")
+
+
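+
+A short sketch of how this dispatcher might be called (create_obj_patch is a module-level helper rather than part of __all__, so importing it directly is an assumption about the module layout; coordinates below are illustrative): straight boxes go to rect_patch, 4-point geometries to polygon_patch.
+
+>>> import numpy as np
+>>> import matplotlib.pyplot as plt
+>>> from doctr.utils.visualization import create_obj_patch
+>>> fig, ax = plt.subplots()
+>>> # straight box: ((xmin, ymin), (xmax, ymax)) in relative coords -> Rectangle patch
+>>> ax.add_patch(create_obj_patch(((0.1, 0.2), (0.4, 0.5)), (600, 800), label="word", color=(0, 0, 1)))
+>>> # rotated box: (4, 2) array of relative points -> Polygon patch
+>>> pts = np.array([[0.1, 0.2], [0.4, 0.2], [0.4, 0.5], [0.1, 0.5]])
+>>> ax.add_patch(create_obj_patch(pts, (600, 800), label="word", color=(1, 0, 0)))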
+def get_colors(num_colors: int) -> List[Tuple[float, float, float]]:
+    """Generate num_colors color for matplotlib
+
+    Args:
+    ----
+        num_colors: number of colors to generate
+
+    Returns:
+    -------
+        colors: list of generated colors
+    """
+    colors = []
+    for i in np.arange(0.0, 360.0, 360.0 / num_colors):
+        hue = i / 360.0
+        lightness = (50 + np.random.rand() * 10) / 100.0
+        saturation = (90 + np.random.rand() * 10) / 100.0
+        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+    return colors
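+
+This mirrors how the KIE visualization further down assigns one color per prediction class; the class names below are purely hypothetical, and the exact RGB values vary per call because lightness and saturation are randomized.
+
+>>> from doctr.utils.visualization import get_colors  # module-level helper, not listed in __all__
+>>> class_names = ["date", "total", "company"]
+>>> color_map = dict(zip(class_names, get_colors(len(class_names))))
+>>> # one distinct (r, g, b) tuple in [0, 1] per class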
 
 
 
-[docs] +[docs] def visualize_page( page: Dict[str, Any], image: np.ndarray, @@ -359,18 +472,18 @@

Source code for doctr.utils.visualization

 ) -> Figure:
     """Visualize a full page with predicted blocks, lines and words
 
-    Example::
-        >>> import numpy as np
-        >>> import matplotlib.pyplot as plt
-        >>> from doctr.utils.visualization import visualize_page
-        >>> from doctr.models import ocr_db_crnn
-        >>> model = ocr_db_crnn(pretrained=True)
-        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-        >>> out = model([[input_page]])
-        >>> visualize_page(out[0].pages[0].export(), input_page)
-        >>> plt.show()
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from doctr.utils.visualization import visualize_page
+    >>> from doctr.models import ocr_db_crnn
+    >>> model = ocr_db_crnn(pretrained=True)
+    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+    >>> out = model([[input_page]])
+    >>> visualize_page(out[0].pages[0].export(), input_page)
+    >>> plt.show()
 
     Args:
+    ----
         page: the exported Page of a Document
         image: np array of the page, needs to have the same shape as page['dimensions']
         words_only: whether only words should be displayed
@@ -378,6 +491,11 @@ 

Source code for doctr.utils.visualization

         scale: figsize of the largest window side
         interactive: whether the plot should be interactive
         add_labels: for static plot, adds text labels on top of bounding box
+        **kwargs: keyword arguments for the polygon patch
+
+    Returns:
+    -------
+        the matplotlib figure
     """
     # Get proper scale and aspect ratio
     h, w = image.shape[:2]
@@ -386,128 +504,189 @@ 

Source code for doctr.utils.visualization

     # Display the image
     ax.imshow(image)
     # hide both axis
-    ax.axis('off')
+    ax.axis("off")
 
     if interactive:
         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
 
-    for block in page['blocks']:
+    for block in page["blocks"]:
         if not words_only:
-            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
+            rect = create_obj_patch(
+                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
+            )
             # add patch on figure
             ax.add_patch(rect)
             if interactive:
                 # add patch to cursor's artists
                 artists.append(rect)
 
-        for line in block['lines']:
+        for line in block["lines"]:
             if not words_only:
-                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
+                rect = create_obj_patch(
+                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
-            for word in line['words']:
-                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
-                                         page['dimensions'], (0, 0, 1), **kwargs)
+            for word in line["words"]:
+                rect = create_obj_patch(
+                    word["geometry"],
+                    page["dimensions"],
+                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
+                    color=(0, 0, 1),
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
                 elif add_labels:
-                    if len(word['geometry']) == 5:
+                    if len(word["geometry"]) == 5:
                         text_loc = (
-                            int(page['dimensions'][1] * (word['geometry'][0] - word['geometry'][2] / 2)),
-                            int(page['dimensions'][0] * (word['geometry'][1] - word['geometry'][3] / 2))
+                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
+                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
                         )
                     else:
                         text_loc = (
-                            int(page['dimensions'][1] * word['geometry'][0][0]),
-                            int(page['dimensions'][0] * word['geometry'][0][1])
+                            int(page["dimensions"][1] * word["geometry"][0][0]),
+                            int(page["dimensions"][0] * word["geometry"][0][1]),
+                        )
+
+                    if len(word["geometry"]) == 2:
+                        # We draw only if boxes are in straight format
+                        ax.text(
+                            *text_loc,
+                            word["value"],
+                            size=10,
+                            alpha=0.5,
+                            color=(0, 0, 1),
                         )
-                    ax.text(
-                        *text_loc,
-                        word['value'],
-                        size=10,
-                        alpha=0.5,
-                        color=(0, 0, 1),
-                    )
 
         if display_artefacts:
-            for artefact in block['artefacts']:
-                rect = create_rect_patch(
-                    artefact['geometry'],
-                    'artefact',
-                    page['dimensions'],
-                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
+            for artefact in block["artefacts"]:
+                rect = create_obj_patch(
+                    artefact["geometry"],
+                    page["dimensions"],
+                    label="artefact",
+                    color=(0.5, 0.5, 0.5),
                     linewidth=1,
-                    **kwargs
+                    **kwargs,
                 )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
-    fig.tight_layout(pad=0.)
+    fig.tight_layout(pad=0.0)
 
     return fig
-def synthetize_page( +def visualize_kie_page( page: Dict[str, Any], - draw_proba: bool = False, - font_size: int = 13, -) -> np.ndarray: - """Draw a the content of the element page (OCR response) on a blank page. + image: np.ndarray, + words_only: bool = False, + display_artefacts: bool = True, + scale: float = 10, + interactive: bool = True, + add_labels: bool = True, + **kwargs: Any, +) -> Figure: + """Visualize a full page with predicted blocks, lines and words + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from doctr.utils.visualization import visualize_page + >>> from doctr.models import ocr_db_crnn + >>> model = ocr_db_crnn(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([[input_page]]) + >>> visualize_kie_page(out[0].pages[0].export(), input_page) + >>> plt.show() Args: - page: exported Page object to represent - draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0 - font_size: size of the font, default font = 13 + ---- + page: the exported Page of a Document + image: np array of the page, needs to have the same shape than page['dimensions'] + words_only: whether only words should be displayed + display_artefacts: whether artefacts should be displayed + scale: figsize of the largest windows side + interactive: whether the plot should be interactive + add_labels: for static plot, adds text labels on top of bounding box + **kwargs: keyword arguments for the polygon patch - Return: - A np array (drawn page) + Returns: + ------- + the matplotlib figure """ - # Draw template - h, w = page["dimensions"] - response = 255 * np.ones((h, w, 3), dtype=np.int32) + # Get proper scale and aspect ratio + h, w = image.shape[:2] + size = (scale * w / h, scale) if h > w else (scale, h / w * scale) + fig, ax = plt.subplots(figsize=size) + # Display the image + ax.imshow(image) + # hide both axis + ax.axis("off") - # Draw each word - for block in page["blocks"]: - for line in block["lines"]: - for word in line["words"]: - # Get aboslute word geometry - (xmin, ymin), (xmax, ymax) = word["geometry"] - xmin, xmax = int(w * xmin), int(w * xmax) - ymin, ymax = int(h * ymin), int(h * ymax) - - # White drawing context adapted to font size, 0.75 factor to convert pts --> pix - h_box, w_box = ymax - ymin, xmax - xmin - h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75)) - img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255)) - d = ImageDraw.Draw(img) - - # Draw in black the value of the word - d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0)) - - # Resize back to box size - img = img.resize((w_box, h_box), Image.NEAREST) - - # Colorize if draw_proba - if draw_proba: - p = int(255 * word["confidence"]) - mask = np.where(np.array(img) == 0, 1, 0) - proba = np.array([255 - p, 0, p]) - color = mask * proba[np.newaxis, np.newaxis, :] - white_mask = 255 * (1 - mask) - img = color + white_mask - - # Write to response page - response[ymin:ymax, xmin:xmax, :] = np.array(img) - - return response + if interactive: + artists: List[patches.Patch] = [] # instantiate an empty list of patches (to be drawn on the page) + + colors = {k: color for color, k in zip(get_colors(len(page["predictions"])), page["predictions"])} + for key, value in page["predictions"].items(): + for prediction in value: + if not words_only: + rect = create_obj_patch( + prediction["geometry"], + page["dimensions"], + label=f"{key} \n {prediction['value']} (confidence: 
{prediction['confidence']:.2%}", + color=colors[key], + linewidth=1, + **kwargs, + ) + # add patch on figure + ax.add_patch(rect) + if interactive: + # add patch to cursor's artists + artists.append(rect) + + if interactive: + import mplcursors + + # Create mlp Cursor to hover patches in artists + mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label())) + fig.tight_layout(pad=0.0) + + return fig + + +def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None: + """Draw an array of relative straight boxes on an image + + Args: + ---- + boxes: array of relative boxes, of shape (*, 4) + image: np array, float32 or uint8 + color: color to use for bounding box edges + **kwargs: keyword arguments from `matplotlib.pyplot.plot` + """ + h, w = image.shape[:2] + # Convert boxes to absolute coords + _boxes = deepcopy(boxes) + _boxes[:, [0, 2]] *= w + _boxes[:, [1, 3]] *= h + _boxes = _boxes.astype(np.int32) + for box in _boxes.tolist(): + xmin, ymin, xmax, ymax = box + image = cv2.rectangle( + image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2 + ) + plt.imshow(image) + plt.plot(**kwargs)
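
A minimal sketch of calling the draw_boxes helper defined above on a blank page (the box coordinates are illustrative): it scales the relative boxes to absolute pixels, draws rectangles with OpenCV and hands the result to matplotlib, so a final plt.show() is still needed.

>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> from doctr.utils.visualization import draw_boxes
>>> page = np.full((600, 800, 3), 255, dtype=np.uint8)  # blank white page
>>> boxes = np.array([[0.1, 0.1, 0.4, 0.2], [0.5, 0.3, 0.9, 0.45]])  # relative (xmin, ymin, xmax, ymax)
>>> draw_boxes(boxes, page, color=(255, 0, 0))
>>> plt.show()
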
@@ -540,8 +719,8 @@

Source code for doctr.utils.visualization

       
     
   
- - + + diff --git a/v0.4.1/_modules/index.html b/v0.4.1/_modules/index.html index e86abcd4d4..5793c44f20 100644 --- a/v0.4.1/_modules/index.html +++ b/v0.4.1/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -225,20 +225,42 @@ - - + + diff --git a/v0.4.1/_sources/changelog.rst.txt b/v0.4.1/_sources/changelog.rst.txt index 430097d6c8..35befe7b96 100644 --- a/v0.4.1/_sources/changelog.rst.txt +++ b/v0.4.1/_sources/changelog.rst.txt @@ -1,6 +1,54 @@ Changelog ========= +v0.10.0 (2024-10-21) +------------------- +Release note: `v0.10.0 `_ + +v0.9.0 (2024-08-08) +------------------- +Release note: `v0.9.0 `_ + +v0.8.1 (2024-03-04) +------------------- +Release note: `v0.8.1 `_ + +v0.8.0 (2024-02-28) +------------------- +Release note: `v0.8.0 `_ + +v0.7.0 (2023-09-09) +------------------- +Release note: `v0.7.0 `_ + +v0.6.0 (2022-09-29) +------------------- +Release note: `v0.6.0 `_ + +v0.5.1 (2022-03-22) +------------------- +Release note: `v0.5.1 `_ + +v0.5.0 (2021-12-31) +------------------- +Release note: `v0.5.0 `_ + +v0.4.1 (2021-11-22) +------------------- +Release note: `v0.4.1 `_ + +v0.4.0 (2021-10-01) +------------------- +Release note: `v0.4.0 `_ + +v0.3.1 (2021-08-27) +------------------- +Release note: `v0.3.1 `_ + +v0.3.0 (2021-07-02) +------------------- +Release note: `v0.3.0 `_ + v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.4.1/_sources/datasets.rst.txt b/v0.4.1/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.4.1/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.4.1/_sources/documents.rst.txt b/v0.4.1/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.4.1/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. 
currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.4.1/_sources/getting_started/installing.rst.txt b/v0.4.1/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.4.1/_sources/getting_started/installing.rst.txt +++ b/v0.4.1/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.4.1/_sources/index.rst.txt b/v0.4.1/_sources/index.rst.txt index fc3ff89fdf..53251db142 100644 --- a/v0.4.1/_sources/index.rst.txt +++ b/v0.4.1/_sources/index.rst.txt @@ -1,7 +1,8 @@ -DocTR: Document Text Recognition -================================ +******************************** +docTR: Document Text Recognition +******************************** -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -9,38 +10,29 @@ State-of-the-art Optical Character Recognition made seamless & accessible to any DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. +* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. -Welcome to the documentation of `DocTR `_! 
- - Main Features ------------- * |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract +* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, small dependencies -* |:tools:| Daily maintained -* |:factory:| Easy integration - +* |:bird:| Light package, minimal dependencies +* |:tools:| Actively maintained by Mindee +* |:factory:| Easy integration (available templates for browser demo & API deployment) -Getting Started ---------------- .. toctree:: :maxdepth: 2 + :caption: Getting started + :hidden: - installing - - -Build & train your predictor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) -* Fine-tune or train from scratch any detection or recognition model to specialize on your data + getting_started/installing + notebooks Model zoo @@ -48,36 +40,83 @@ Model zoo Text detection models """"""""""""""""""""" - * `DBNet `_ (Differentiable Binarization) - * `LinkNet `_ +* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ +* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ +* FAST from `"FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation" `_ Text recognition models """"""""""""""""""""""" - * `SAR `_ (Show, Attend and Read) - * `CRNN `_ (Convolutional Recurrent Neural Network) - * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) +* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ +* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ +* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ +* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ +* PARSeq from `"Scene Text Recognition with Permuted Autoregressive Sequence Models" `_ Supported datasets ^^^^^^^^^^^^^^^^^^ - * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. - * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. - * SROIE from `ICDAR 2019 `_. +* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. +* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +* SROIE from `ICDAR 2019 `_. +* IIIT-5k from `CVIT `_. +* Street View Text from `"End-to-End Scene Text Recognition" `_. +* SynthText from `Visual Geometry Group `_. +* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. +* IC03 from `ICDAR 2003 `_. +* IC13 from `ICDAR 2013 `_. +* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. +* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. +* IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. 
toctree:: :maxdepth: 2 - :caption: Notes + :caption: Using docTR + :hidden: - changelog + using_doctr/using_models + using_doctr/using_datasets + using_doctr/using_contrib_modules + using_doctr/sharing_models + using_doctr/using_model_export + using_doctr/custom_models_training + using_doctr/running_on_aws + + +.. toctree:: + :maxdepth: 2 + :caption: Community + :hidden: + + community/resources .. toctree:: :maxdepth: 2 :caption: Package Reference + :hidden: - datasets - documents - models - transforms - utils + modules/contrib + modules/datasets + modules/io + modules/models + modules/transforms + modules/utils + + +.. toctree:: + :maxdepth: 2 + :caption: Contributing + :hidden: + + contributing/code_of_conduct + contributing/contributing + + +.. toctree:: + :maxdepth: 2 + :caption: Notes + :hidden: + + changelog diff --git a/v0.4.1/_sources/installing.rst.txt b/v0.4.1/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.4.1/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.4.1/_sources/models.rst.txt b/v0.4.1/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.4.1/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. 
- - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. 
-We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.4.1/_sources/transforms.rst.txt b/v0.4.1/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.4.1/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. 
autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.4.1/_sources/utils.rst.txt b/v0.4.1/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.4.1/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.4.1/_static/basic.css b/v0.4.1/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.4.1/_static/basic.css +++ b/v0.4.1/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.4.1/_static/doctools.js b/v0.4.1/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.4.1/_static/doctools.js +++ b/v0.4.1/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.4.1/_static/documentation_options.js b/v0.4.1/_static/documentation_options.js index a7b5cbe04a..4f656fdbea 100644 --- a/v0.4.1/_static/documentation_options.js +++ b/v0.4.1/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.3.0a0-git', + VERSION: '0.10.1a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.4.1/_static/language_data.js b/v0.4.1/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.4.1/_static/language_data.js +++ b/v0.4.1/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.4.1/_static/searchtools.js b/v0.4.1/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.4.1/_static/searchtools.js +++ b/v0.4.1/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.4.1/changelog.html b/v0.4.1/changelog.html index eafac3a877..fc45a50384 100644 --- a/v0.4.1/changelog.html +++ b/v0.4.1/changelog.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + Changelog - docTR documentation @@ -226,20 +226,42 @@ + diff --git a/v0.4.1/community/resources.html b/v0.4.1/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.4.1/community/resources.html +++ b/v0.4.1/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.4.1/contributing/code_of_conduct.html b/v0.4.1/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.4.1/contributing/code_of_conduct.html +++ b/v0.4.1/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.4.1/contributing/contributing.html b/v0.4.1/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.4.1/contributing/contributing.html +++ b/v0.4.1/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.4.1/datasets.html b/v0.4.1/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.4.1/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework -can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding boxes (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding boxes (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding boxes (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to the image folder (all JPG files at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding boxes (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
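A hypothetical usage sketch following the signature above; the folder and label-file paths are placeholders, and the expected label-file format is not detailed here:
>>> from doctr.datasets import OCRDataset
>>> train_set = OCRDataset(img_folder="path/to/images", label_file="path/to/labels.json")
>>> img, target = train_set[0]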
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before being passed to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets -of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name

size

characters

digits

10

0123456789

ascii_letters

52

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

punctuation

32

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

currency

5

£€¥¢฿

latin

96

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

french

154

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as a mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
- -
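A minimal usage sketch combining the vocab table above with encode_sequences. It assumes the vocab strings are exposed through the VOCABS dictionary (doctr.datasets.vocabs); the eos/pad index choices below are illustrative, not prescribed values:
>>> from doctr.datasets import encode_sequences
>>> from doctr.datasets.vocabs import VOCABS  # assumed location of the vocab strings
>>> vocab = VOCABS["french"]
>>> encoded = encode_sequences(
...     sequences=["hello", "world"],
...     vocab=vocab,
...     target_size=16,       # pad/truncate every sequence to 16 positions
...     eos=len(vocab),       # illustrative End Of String index (outside the vocab range)
...     pad=len(vocab) + 1,   # illustrative padding index
... )
>>> encoded.shape  # (2, 16) numpy array of character indices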
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/documents.html b/v0.4.1/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.4.1/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, two pieces of text at the same height belong to two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and the confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
-
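To make the hierarchy above concrete, here is a minimal sketch building a one-word document by hand with the constructors documented in this section. In practice these objects are produced by an OCR predictor; the coordinates and dimensions below are arbitrary placeholder values:
>>> from doctr.documents import Word, Line, Block, Page, Document
>>> word = Word(value="Hello", confidence=0.99, geometry=((0.1, 0.1), (0.3, 0.15)))
>>> line = Line(words=[word])    # geometry resolved from the enclosed words
>>> block = Block(lines=[line])  # no artefacts in this block
>>> page = Page(blocks=[block], page_idx=0, dimensions=(595, 842))  # (width, height) in pixels
>>> doc = Document(pages=[page])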

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF file, returned as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert them into images in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/genindex.html b/v0.4.1/genindex.html index a19b433943..21520455b4 100644 --- a/v0.4.1/genindex.html +++ b/v0.4.1/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -224,20 +224,42 @@

+
+

U

+ + +
+
+

V

@@ -561,7 +711,13 @@

V

W

+
@@ -599,8 +755,8 @@

W

- - + + diff --git a/v0.4.1/getting_started/installing.html b/v0.4.1/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.4.1/getting_started/installing.html +++ b/v0.4.1/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.4.1/index.html b/v0.4.1/index.html index 4c6a28c66a..3a06afc6d9 100644 --- a/v0.4.1/index.html +++ b/v0.4.1/index.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + docTR documentation @@ -226,20 +226,42 @@
-

DocTR: Document Text Recognition

-

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta)

+

docTR: Document Text Recognition

+

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

DocTR provides an easy and powerful way to extract valuable information from your documents:

    -
  • 🧾 for automation: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • +
  • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • 👩‍🔬 for research: quickly compare your own architectures speed & performances with state-of-art models on public datasets.

-

Welcome to the documentation of DocTR!

Main Features

  • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

  • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor

  • -
  • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

  • +
  • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

  • ⚡ Optimized for inference speed on both CPU & GPU

  • -
  • 🐦 Light package, small dependencies

  • -
  • 🛠️ Daily maintained

  • -
  • 🏭 Easy integration

  • +
  • 🐦 Light package, minimal dependencies

  • +
  • 🛠️ Actively maintained by Mindee

  • +
  • 🏭 Easy integration (available templates for browser demo & API deployment)

-
-
-

Getting Started

-
-

Build & train your predictor

-
    -
  • Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

  • -
  • Fine-tune or train from scratch any detection or recognition model to specialize on your data

  • -
-

Model zoo

Text detection models

-
-

Text recognition models

-
-

Supported datasets

-
-
+
+
+
+
+
@@ -406,7 +381,7 @@

Supported datasets - +
Next @@ -446,10 +421,8 @@

Supported datasets + diff --git a/v0.4.1/installing.html b/v0.4.1/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.4.1/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running an OS other than Linux, you will need a few extra dependencies.

-

For MacOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the latest stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/models.html b/v0.4.1/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.4.1/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Input shape

# params

Recall

Precision

Recall

Precision

FPS

db_resnet50

(1024, 1024, 3)

25.2 M

82.14

87.64

92.49

89.66

2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model and feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

-
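A rough sketch of this benchmarking protocol for the detection model; timing details are simplified (the same random tensor is reused across iterations), so absolute numbers may differ from the table above:
>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> dummy = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> for _ in range(100):   # warm-up
...     _ = model(dummy)
>>> start = time.time()
>>> for _ in range(1000):  # timed run: 1000 batches of 1 frame
...     _ = model(dummy)
>>> fps = 1000 / (time.time() - start)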
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following (a short sketch of these steps is given after the list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation
  2. batch images together
  3. normalize the batch using the training data statistics
-
-
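A minimal sketch of these three steps using plain TensorFlow operations; the normalization statistics below are placeholders, not the actual training statistics:
>>> import tensorflow as tf
>>> images = [tf.random.uniform([600, 800, 3]) for _ in range(2)]  # stand-ins for real pages
>>> resized = [tf.image.resize(img, [1024, 1024], method="bilinear") for img in images]
>>> batch = tf.stack(resized, axis=0)                        # (N, 1024, 1024, 3)
>>> mean, std = tf.constant([0.5, 0.5, 0.5]), tf.constant([1.0, 1.0, 1.0])  # placeholders
>>> batch = (batch - mean) / std                             # channel-wise normalization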
-

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Text recognition model zoo

Architecture

Input shape

# params

FUNSD

CORD

FPS

crnn_vgg16_bn

(32, 128, 3)

15.8M

86.02

91.3

12.8

sar_vgg16_bn

(32, 128, 3)

21.5M

86.2

91.7

3.3

sar_resnet31

(32, 128, 3)

53.1M

86.3

92.1

2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model's capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model and feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following (a short sketch of these steps is given after the list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation
  2. pad the image to the target size (with zeros by default)
  3. batch images together
  4. normalize the batch using the training data statistics
-
-
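A minimal sketch of these four steps; the aspect-preserving resize plus zero-padding is approximated here with tf.image.resize_with_pad, and the normalization statistics are placeholders:
>>> import tensorflow as tf
>>> crops = [tf.random.uniform([40, 180, 3]) for _ in range(2)]            # stand-ins for word crops
>>> padded = [tf.image.resize_with_pad(crop, 32, 128) for crop in crops]   # resize then zero-pad
>>> batch = tf.stack(padded, axis=0)                                       # (N, 32, 128, 3)
>>> mean, std = tf.constant([0.5, 0.5, 0.5]), tf.constant([1.0, 1.0, 1.0]) # placeholders
>>> batch = (batch - mean) / std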
-

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
-

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import crnn_vgg16_bn
->>> model = crnn_vgg16_bn(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong -Baseline for Irregular Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import sar_vgg16_bn
->>> model = sar_vgg16_bn(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a ResNet-31 feature extractor as described in “Show, Attend and Read: A Simple and Strong -Baseline for Irregular Text Recognition”.

-

Example

-
>>> import tensorflow as tf
->>> from doctr.models import sar_resnet31
->>> model = sar_resnet31(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
-

MASTER as described in the paper: https://arxiv.org/pdf/1910.02562.pdf. -Example:

-
>>> import tensorflow as tf
->>> from doctr.models import master
->>> model = master(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-

Recognition predictors

-

Combining the right components around a given architecture for easier usage.

-
-
-doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
-

Text recognition architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import recognition_predictor
->>> model = recognition_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • -
  • pretrained – If True, returns a model pre-trained on our text recognition dataset

  • -
-
-
Returns:
-

Recognition predictor

-
-
-
- -
-
-
-

End-to-End OCR

-

Predictors that localize and identify text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Recall

Precision

FPS

Recall

Precision

FPS

db_resnet50 + crnn_vgg16_bn

70.08

74.77

0.85

82.19

79.67

1.6

db_resnet50 + sar_vgg16_bn

N/A

N/A

0.49

N/A

N/A

1.0

db_resnet50 + sar_resnet31

N/A

N/A

0.27

N/A

N/A

0.83

Gvision text detection

59.50

62.50

75.30

70.00

Gvision doc. text detection

64.00

53.30

68.90

61.10

AWS textract

78.10

83.00

87.50

66.00

-
-

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All recognition models of predictors are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

-

FPS (Frames per second) is computed this way: we instantiate the predictor, warm up the model, and then measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

-

Results on private ocr datasets

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Receipts

Invoices

IDs

Architecture

Recall

Precision

Recall

Precision

Recall

Precision

db_resnet50 + crnn_vgg16_bn (ours)

78.90

81.01

65.68

69.86

49.48

50.46

Gvision doc. text detection

68.91

59.89

63.20

52.85

43.70

29.21

AWS textract

75.77

77.70

70.47

69.13

46.39

43.32

-
-
-

Two-stage approaches

-

Those architectures involve one stage of text detection and one stage of text recognition. The text detection stage produces cropped images that are then passed to the text recognition block.

-
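Putting the pieces of this documentation together, a minimal end-to-end sketch reads a file with DocumentFile and runs the two-stage predictor on the resulting pages (the path is a placeholder):
>>> from doctr.documents import DocumentFile
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)
>>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
>>> result = model(pages)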
-
-doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]
-

End-to-end OCR architecture using one model for localization, and another for text recognition.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import ocr_predictor
->>> model = ocr_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

  • -
  • pretrained – If True, returns a model pre-trained on our OCR dataset

  • -
-
-
Returns:
-

OCR predictor

-
-
-
- -
-
-
-

Model export

-

Utility functions to make the most of document analysis models.

-
-

Model compression

-
-
-doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
-

Converts a model to TFLite format

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_tflite, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_tflite(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
-

Converts a model to half precision

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_fp16, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_fp16(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the serialized FP16 model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
-

Quantize a Tensorflow model

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import quantize_model, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = quantize_model(model, (224, 224, 3))
-
-
-
-
-
-
Parameters:
-
    -
  • tf_model – a keras model

  • -
  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

  • -
-
-
Returns:
-

the serialized quantized model

-
-
Return type:
-

bytes

-
-
-
- -
-
-

Using SavedModel

-

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

-
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> _ = model(input_t, training=False)
->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
-
-
-

And loaded just as easily:

-
>>> import tensorflow as tf
->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
-
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/modules/contrib.html b/v0.4.1/modules/contrib.html index e99f6b3f74..7fb86b8b38 100644 --- a/v0.4.1/modules/contrib.html +++ b/v0.4.1/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -380,7 +380,7 @@

Supported contribution modules - + diff --git a/v0.4.1/modules/datasets.html b/v0.4.1/modules/datasets.html index 456e10b172..380a986793 100644 --- a/v0.4.1/modules/datasets.html +++ b/v0.4.1/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -1081,7 +1081,7 @@

Returns:

- + diff --git a/v0.4.1/modules/io.html b/v0.4.1/modules/io.html index 01eadaa4b8..24c41954be 100644 --- a/v0.4.1/modules/io.html +++ b/v0.4.1/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -760,7 +760,7 @@

Returns: - + diff --git a/v0.4.1/modules/models.html b/v0.4.1/modules/models.html index c465cc0586..91b8810a6a 100644 --- a/v0.4.1/modules/models.html +++ b/v0.4.1/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1612,7 +1612,7 @@

Args: - + diff --git a/v0.4.1/modules/transforms.html b/v0.4.1/modules/transforms.html index 30f7a2631a..c5ead3f3ce 100644 --- a/v0.4.1/modules/transforms.html +++ b/v0.4.1/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -835,7 +835,7 @@

Args:< - + diff --git a/v0.4.1/modules/utils.html b/v0.4.1/modules/utils.html index 888a32c321..b7f6fc570b 100644 --- a/v0.4.1/modules/utils.html +++ b/v0.4.1/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -715,7 +715,7 @@

Args: - + diff --git a/v0.4.1/notebooks.html b/v0.4.1/notebooks.html index f97771aebb..d36539f59e 100644 --- a/v0.4.1/notebooks.html +++ b/v0.4.1/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -391,7 +391,7 @@

docTR Notebooks - + diff --git a/v0.4.1/objects.inv b/v0.4.1/objects.inv index a22d2ce821..c1700f291b 100644 Binary files a/v0.4.1/objects.inv and b/v0.4.1/objects.inv differ diff --git a/v0.4.1/py-modindex.html b/v0.4.1/py-modindex.html deleted file mode 100644 index c1569be607..0000000000 --- a/v0.4.1/py-modindex.html +++ /dev/null @@ -1,330 +0,0 @@ - - - - - - - - - - - Python Module Index - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
- -
-

Python Module Index

- -
- - - - - - - - - - - -
 
d
- doctr -
- -
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.4.1/search.html b/v0.4.1/search.html index 73772822d2..d050f5eac7 100644 --- a/v0.4.1/search.html +++ b/v0.4.1/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -226,20 +226,42 @@ - - + + diff --git a/v0.4.1/searchindex.js b/v0.4.1/searchindex.js index 803f4f4bcf..6f154115ab 100644 --- a/v0.4.1/searchindex.js +++ b/v0.4.1/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Artefact": [[2, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Block": [[2, "block"]], "Build & train your predictor": [[3, "build-train-your-predictor"]], "Changelog": [[0, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection models": [[5, "detection-models"]], "Detection predictors": [[5, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[3, null]], "Document": [[2, "document"]], "Document structure": [[2, "document-structure"]], "End-to-End OCR": [[5, "end-to-end-ocr"]], "File reading": [[2, "file-reading"]], "Getting Started": [[3, "getting-started"]], "Installation": [[4, null]], "Line": [[2, "line"]], "Main Features": [[3, "main-features"]], "Model compression": [[5, "model-compression"]], "Model export": [[5, "model-export"]], "Model zoo": [[3, "model-zoo"]], "Notes": [[3, null]], "Package Reference": [[3, null]], "Page": [[2, "page"]], "Pre-processing for detection": [[5, "pre-processing-for-detection"]], "Pre-processing for recognition": [[5, "pre-processing-for-recognition"]], "Prerequisites": [[4, "prerequisites"]], "Recognition models": [[5, "recognition-models"]], "Recognition predictors": [[5, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[3, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[7, "task-evaluation"]], "Text Detection": [[5, "text-detection"]], "Text Recognition": [[5, "text-recognition"]], "Text detection models": [[3, "text-detection-models"]], "Text recognition model zoo": [[5, "id2"]], "Text recognition models": [[3, "text-recognition-models"]], "Two-stage approaches": [[5, "two-stage-approaches"]], "Using SavedModel": [[5, "using-savedmodel"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[7, "visualization"]], "Word": [[2, "word"]], "doctr.datasets": [[1, null]], "doctr.documents": [[2, null]], "doctr.models": [[5, null]], "doctr.transforms": [[6, null]], "doctr.utils": [[7, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]]}, "docnames": ["changelog", "datasets", "documents", "index", "installing", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "documents.rst", "index.rst", "installing.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.documents)": [[2, "doctr.documents.Artefact", false]], "as_images() (doctr.documents.pdf 
method)": [[2, "doctr.documents.PDF.as_images", false]], "block (class in doctr.documents)": [[2, "doctr.documents.Block", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.documents)": [[2, "doctr.documents.Document", false]], "documentfile (class in doctr.documents)": [[2, "doctr.documents.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_images", false]], "from_pdf() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_pdf", false]], "from_url() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_artefacts", false]], "get_words() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.documents)": [[2, "doctr.documents.Line", false]], "linknet16() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet16", false]], "localizationconfusion (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.LocalizationConfusion", false]], "master() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.master", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[5, "doctr.models.zoo.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.documents)": [[2, "doctr.documents.Page", false]], "pdf (class in doctr.documents)": [[2, "doctr.documents.PDF", false]], "quantize_model() (in module doctr.models.export)": [[5, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[6, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality 
(class in doctr.transforms)": [[6, "doctr.transforms.RandomJpegQuality", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.documents)": [[2, "doctr.documents.read_html", false]], "read_img() (in module doctr.documents)": [[2, "doctr.documents.read_img", false]], "read_pdf() (in module doctr.documents)": [[2, "doctr.documents.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_vgg16_bn", false]], "show() (doctr.documents.document method)": [[2, "doctr.documents.Document.show", false]], "show() (doctr.documents.page method)": [[2, "doctr.documents.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[7, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[7, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[7, "doctr.utils.metrics.TextMatch.summary", false]], "textmatch (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.datasets)": [[1, "doctr.datasets.datasets.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[7, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.documents)": [[2, "doctr.documents.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, "", "CORD"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.datasets": [[1, 0, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.documents": [[2, 0, 1, "", "Artefact"], [2, 0, 1, "", "Block"], [2, 0, 1, "", "Document"], [2, 0, 1, "", "DocumentFile"], [2, 0, 1, "", "Line"], [2, 0, 1, "", "PDF"], [2, 0, 1, "", "Page"], [2, 0, 1, "", "Word"], [2, 1, 1, "", "read_html"], [2, 1, 1, "", "read_img"], [2, 1, 1, "", "read_pdf"]], "doctr.documents.Document": [[2, 2, 1, "", "show"]], "doctr.documents.DocumentFile": [[2, 2, 1, "", "from_images"], [2, 2, 1, "", "from_pdf"], [2, 2, 1, "", "from_url"]], "doctr.documents.PDF": [[2, 2, 1, "", "as_images"], [2, 2, 1, "", "get_artefacts"], [2, 2, 1, "", "get_words"]], "doctr.documents.Page": [[2, 2, 1, "", "show"]], "doctr.models.detection": [[5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet16"]], "doctr.models.export": [[5, 1, 1, "", "convert_to_fp16"], [5, 1, 1, "", "convert_to_tflite"], [5, 1, 1, "", "quantize_model"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "master"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"], [5, 1, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[5, 1, 1, "", "ocr_predictor"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", 
"RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", "RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[7, 0, 1, "", "LocalizationConfusion"], [7, 0, 1, "", "OCRMetric"], [7, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.OCRMetric": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.TextMatch": [[7, 2, 1, "", "summary"]], "doctr.utils.visualization": [[7, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 7], "0": [1, 3, 5, 6, 7], "00": 5, "01": 5, "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 5, "02562": 5, "03": 3, "035": [], "0361328125": [], "04": [], "05": 3, "06": [], "06640625": [], "07": [], "08": 5, "09": [], "0966796875": [], "1": [1, 3, 5, 6, 7], "10": [1, 5, 7], "100": [5, 6, 7], "1000": 5, "101": [], "1024": [5, 7], "104": [], "106": [], "108": [], "1095": [], "11": 3, "110": 7, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 5, "120": [], "123": [], "126": [], "1268": [], "128": 5, "13": 5, "130": [], "13068": [], "131": [], "1337891": [], "1357421875": [], "1396484375": [], "14": 5, "1420": [], "14470v1": [], "149": [], "15": 5, "150": 7, "154": 1, "1552": [], "16": 5, "160": 5, "1630859375": [], "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": 3, "185546875": [], "19": 5, "1900": [], "1910": 5, "19342": [], "19370": [], "195": [], "19598": [], "199": 5, "1999": [], "1m": 5, "2": [3, 5, 6], "20": 5, "200": 7, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 3, "2021": 3, "2023": [], "207901": [], "21": 5, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6], "225": 6, "22672": [], "229": 6, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 5, "2504": [], "255": [5, 6, 7], "256": 5, "257": [], "26": [], "26032": [], "264": [], "27": 5, "2700": [], "2710": [], "2749": [], "28": 3, "287": [], "29": 5, "296": [], "299": [], "2d": [], "3": [2, 3, 4, 5, 6, 7], "30": 5, "300": [], "3000": [], "301": [], "30595": 5, "30ghz": [], "31": 5, "32": [1, 5, 6], "3232421875": [], "33": [], "33402": [], "33608": [], "34": [], "340": [], "3456": [], "3515625": [], "36": [], "360": [], "37": [], "38": [], "39": 5, "4": [], "40": [], "406": 6, "41": [], "42": [], "43": 5, "44": [], "45": [], "456": 6, "46": 5, "47": 5, "472": [], "48": 5, "485": 6, "49": 5, "49377": [], "5": [1, 6, 7], "50": 5, "51": [], "51171875": [], "512": [], "52": [1, 5], "529": [], "53": 5, "533": [], "54": [], "540": [], "5478515625": [], "55": [], "56": [], "57": [], "58": [], "580": [], "5810546875": [], "583": [], "59": 5, "595": [], "597": [], "5k": [], "5m": 5, "6": [4, 5, 6], "60": 6, "600": [5, 7], "61": 5, "611": [], "62": 5, "625": [], "626": [], "629": [], "63": 5, "630": [], "64": [5, 6], "640": [], "641": [], "647": [], "65": 5, "66": 5, "660": [], "664": [], "666": [], "67": 5, "672": [], "68": 5, "689": [], "69": 5, "693": [], "694": [], "695": [], "6m": [], "7": 5, "70": [5, 7], "700": [], "701": [], "702": [], "707470": [], "71": [], "7100000": [], 
"713": [], "7141797": [], "7149": [], "72": [], "72dpi": [], "73": [], "73257": [], "733": [], "74": 5, "745": [], "75": 5, "753": [], "7581382": [], "76": [], "77": 5, "772": [], "772875": [], "78": 5, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 5, "793533": [], "796": [], "798": [], "7m": [], "8": [5, 6], "80": [], "800": [5, 7], "81": 5, "817": [], "82": 5, "8275l": 5, "83": 5, "830": [], "84": [], "849": [], "85": 5, "8564453125": [], "857": [], "85875": [], "86": 5, "860": [], "8603515625": [], "862": [], "863": [], "87": 5, "8707": [], "875": [], "88": [], "89": 5, "8m": 5, "9": [], "90": 5, "90k": [], "90kdict32px": [], "91": 5, "913": [], "914085328578949": [], "917": [], "92": 5, "921": [], "93": [], "94": [], "95": 7, "9578408598899841": [], "96": 1, "97": [], "98": [], "99": [], "9949972033500671": [], "A": [1, 2, 3, 5], "And": 5, "As": [], "Be": [], "Being": [], "By": [], "For": [4, 5], "If": [2, 4, 5], "In": [1, 5], "It": 6, "Its": 5, "No": [], "Of": 1, "Or": [], "The": [1, 2, 5, 7], "Then": 5, "To": [], "_": [1, 5], "__call__": [], "_build": [], "_i": 7, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": [], "abl": [], "about": 5, "abov": 5, "abstract": 1, "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 3], "account": [], "accur": [], "accuraci": 7, "achiev": [], "act": [], "action": [], "activ": [], "ad": 6, "adapt": [], "add": [6, 7], "add_hook": [], "add_label": 7, "addit": [], "addition": 5, "address": 2, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": [], "aggreg": [1, 7], "aggress": [], "align": 2, "all": [1, 2, 3, 5, 6, 7], "allow": [], "along": 5, "alreadi": [], "also": [], "alwai": [], "an": [1, 2, 3, 5, 7], "analysi": [2, 5], "ancient_greek": [], "andrej": [], "angl": 2, "ani": [1, 2, 3, 5, 6, 7], "annot": 2, "anot": [], "anoth": [1, 4, 5], "answer": [], "anyascii": [], "anyon": 3, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": 5, "appoint": [], "appreci": [], "appropri": [], "ar": [1, 2, 4, 5, 6, 7], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [3, 5], "archiv": [], "area": [], "argument": [1, 2], "around": 5, "arrai": [2, 7], "art": 3, "artefact": 7, "artefact_typ": 2, "articl": [], "artifici": [], "arxiv": 5, "as_imag": 2, "asarrai": 7, "ascii_lett": 1, "aspect": [3, 6], "assess": 7, "assign": 7, "associ": 2, "assum": [], "assume_straight_pag": [], "astyp": [5, 7], "attack": [], "attend": [3, 5], "attent": [], "autoclass": [], "autom": 3, "automat": [], "autoregress": [], "avail": [3, 5, 6], "averag": [5, 6], "avoid": [], "aw": [3, 5], "awar": [], "azur": [], "b": 7, "b_j": 7, "back": [], "backbon": 5, "backend": 5, "background": [], "bangla": [], "bar": [], "bar_cod": [], "baranovskij": [], "base": 5, "baselin": 5, "batch": [1, 5, 6], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [5, 7], "befor": 1, "begin": 7, "behavior": [], "being": [5, 7], "belong": [], "benchmark": [], "best": [], "beta": 3, "better": [], "between": [6, 7], "bgr": 2, "bilinear": [5, 6], "bin_thresh": [], "binar": [3, 5], "binari": 2, "bit": [], "block": [5, 7], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 2, 5, 6, 7], "boolean": [], "both": [3, 5, 6], "bottom": [], "bound": [1, 2, 6, 7], "box": [1, 2, 7], "box_thresh": [], "brew": 4, "bright": 6, "browser": 
[], "build": [], "built": [], "byte": [2, 5], "c": [], "c5": 5, "c_j": [], "cach": [], "cache_sampl": [], "cairo": 4, "call": [], "callabl": [1, 6], "can": [1, 4, 5], "capabl": 5, "case": [1, 7], "cf": 5, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": 3, "channel": [2, 5, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 3, 5, 7], "charactergener": [], "characterist": [], "charg": 5, "charset": [], "chart": 2, "check": [], "checkpoint": [], "chip": [], "christian": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 2, 6, 7], "class_nam": [], "classif": [], "classmethod": 2, "clear": [], "clone": 4, "close": [], "co": [], "code": [2, 3], "codecov": [], "colab": [], "collate_fn": [], "collect": 2, "color": 6, "colorinvers": 6, "column": 2, "com": [2, 4], "combin": 5, "command": [], "comment": [], "commit": [], "common": [6, 7], "commun": [], "compar": 3, "comparison": 7, "competit": 1, "compil": [], "complaint": [], "complementari": 7, "complet": [], "compon": 5, "compos": [1, 3, 5], "comprehens": [], "comput": [5, 7], "conf_threshold": [], "confid": 2, "config": [], "configur": [], "confus": 7, "consecut": [5, 6], "consequ": [], "consid": [1, 2, 7], "consist": [], "consolid": [1, 3], "constant": 6, "construct": [], "contact": [], "contain": [], "content": [1, 2], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 5, "convers": 2, "convert": [2, 5, 6], "convert_page_to_numpi": 2, "convert_to_fp16": 5, "convert_to_tflit": 5, "convolut": 3, "cool": [], "coordin": 2, "cord": [1, 3, 5], "core": 7, "corner": [], "correct": 6, "correspond": [4, 5], "could": [], "counterpart": 7, "cover": [], "coverag": [], "cpu": [3, 5], "creat": [], "crnn": [3, 5], "crnn_mobilenet_v3_larg": [], "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 5, "crnn_vgg16_bn": 5, "crop": 5, "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": [], "daili": 3, "danish": [], "data": [2, 3, 5, 6, 7], "dataload": 1, "dataset": 5, "dataset_info": [], "date": [], "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": [], "db_resnet34": [], "db_resnet50": 5, "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [3, 5], "deal": [], "decis": [], "decod": 2, "decode_img_as_tensor": [], "dedic": [], "deem": [], "deep": 5, "def": [], "default": [2, 5], "defer": 1, "defin": 7, "deform": 5, "degre": [], "degress": 2, "delet": [], "delimit": [], "delta": 6, "demo": [], "demonstr": [], "depend": [3, 4], "deploi": [], "deploy": [], "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": [], "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": 5, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": [], "developp": 4, "deviat": 6, "devic": [], "dict": [2, 7], "dictionari": [2, 7], "differ": [], "differenti": [3, 5], "digit": 1, "dimens": [2, 5, 7], "dimension": 6, "direct": [], "directli": 5, "directori": [], "disabl": [], 
"disable_crop_orient": [], "disable_page_orient": [], "disclaim": 5, "discuss": [], "disk": [], "disparag": [], "displai": [2, 7], "display_artefact": 7, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": [], "do": 4, "doc": [2, 5], "docartefact": [], "docstr": [], "doctr": 4, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 5, 7], "documentbuild": [], "documentfil": 2, "doesn": [], "don": [], "done": 6, "download": 1, "downsiz": [], "draw": 6, "drop": 1, "drop_last": 1, "dtype": 5, "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": [], "dynamic_seq_length": [], "e": [2, 4], "each": [1, 2, 3, 5, 6, 7], "eas": [], "easi": [3, 7], "easier": 5, "easili": [2, 5, 7], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 5], "either": 5, "element": [1, 2, 5], "els": [], "email": [], "empathi": [], "en": [], "enabl": 2, "enclos": 2, "encod": [1, 2, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 3, 7], "english": [], "enough": 5, "ensur": [], "entir": 2, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 2, "ethnic": [], "evalu": [1, 3, 5], "event": [], "everyon": [], "everyth": [], "exact": 7, "exactmatch": [], "exampl": [1, 2, 5, 6, 7], "exchang": [], "exclud": 5, "execut": [], "exist": [], "expand": [], "expect": [2, 5, 6], "experi": 5, "explan": 5, "explicit": [], "exploit": 5, "export": [2, 3, 7], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 2, "extern": [], "extra": 4, "extract": [1, 3], "extract_arch": 1, "extractor": 5, "f_": 7, "f_a": 7, "factor": 6, "fair": [], "fairli": [], "fals": [1, 5, 6, 7], "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [5, 7], "feed": 5, "feedback": [], "feel": [], "felix92": [], "few": 4, "figsiz": 7, "figur": 7, "file": [1, 3], "file_hash": 1, "file_nam": 1, "final": [], "find": 4, "fine": 3, "finnish": [], "first": [], "firsthand": [], "fit": [], "fitz": 2, "flag": [], "flexibl": 7, "flip": [], "float": [2, 6, 7], "float32": 5, "fn": 6, "focu": [], "focus": [], "folder": [1, 5], "follow": [1, 4, 5, 6, 7], "font": [], "font_famili": [], "foral": 7, "forc": [], "forg": [], "form": [1, 3], "format": [2, 5], "forpost": [1, 3], "forum": [], "found": [], "fp": 5, "fp16": 5, "frac": 7, "frame": 5, "framework": 1, "free": [], "french": [1, 5], "friendli": 3, "from": [1, 2, 3, 5, 6, 7], "from_hub": [], "from_imag": 2, "from_pdf": 2, "from_url": 2, "full": [1, 5, 7], "fulli": [], "function": [5, 6, 7], "funsd": [1, 3, 5], "further": [], "futur": [], "g": 2, "g_": 7, "g_x": 7, "gallagh": [], "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 4, "gen": [], "gender": [], "gener": [], "generic_cyrillic_lett": [], "geometri": 2, "geq": 7, "german": [], "get": 2, "get_artefact": 2, "get_word": 2, "gettextword": 2, "git": 3, "github": 4, "give": [], "given": [1, 2, 5, 7], "global": [], "go": [], "good": [], "googl": [], "googlevis": 3, "gpu": 3, "gracefulli": [], "graph": 2, "grayscal": 6, "ground": 7, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "gtk": 4, "guid": [], "guidanc": [], "gvision": 5, "h": 2, "h_": 7, "ha": [1, 7], "half": 5, "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 7, "have": [1, 5, 7], "head": [], "healthi": [], "hebrew": [], "height": 2, "hello": 7, 
"help": [], "here": [1, 4, 6], "hf": [], "hf_hub_download": [], "high": 2, "higher": 4, "hindi": [], "hindi_digit": [], "hocr": [], "hook": [], "horizont": 2, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [2, 4, 5], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 2, 5, 6, 7], "i7": [], "ibrahimov": [], "ic03": [], "ic13": [], "icdar": 3, "icdar2019": 1, "id": 5, "ident": [], "identifi": [3, 5], "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 2, 5, 6, 7], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": [], "img_fold": 1, "img_path": [], "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 2, 5, 6, 7], "import": [1, 2, 5, 6, 7], "improv": [], "inappropri": [], "incid": [], "includ": [4, 5], "inclus": [], "increas": 6, "independ": [], "index": 2, "indic": 7, "individu": [], "infer": [3, 6], "inform": [1, 3, 5], "inherit": [1, 5], "input": [2, 5, 6], "input_crop": [], "input_pag": [5, 7], "input_shap": 5, "input_t": 5, "input_tensor": 5, "inspir": 6, "instal": 3, "instanc": 5, "instanti": 5, "instead": [1, 2], "insult": [], "int": [1, 2, 5, 6, 7], "int64": [], "integ": 7, "integr": 3, "intel": [], "interact": [2, 7], "interfac": [], "interoper": [], "interpol": [5, 6], "interpret": [1, 2], "intersect": 7, "invert": 6, "investig": [], "invis": [], "invoic": 5, "involv": 5, "io": [], "iou": 7, "iou_thresh": 7, "iou_threshold": [], "irregular": 5, "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 2, 5, 7], "itself": [], "j": 7, "jame": [], "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 2], "json": [], "json_output": [], "jump": [], "just": 5, "kei": [], "kera": 5, "kernel": [], "kernel_s": 5, "kernel_shap": [], "keywoard": [], "keyword": [1, 2], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 2, 5, 7], "l": 7, "l_j": 7, "label": [1, 7], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": [], "languag": [2, 3], "larg": [], "largest": 7, "last": [1, 4, 5], "latenc": [], "later": [], "latest": 4, "latin": 1, "layer": [], "layout": [], "lead": [], "leader": [], "learn": 5, "least": 4, "left": 7, "legacy_french": [], "length": 1, "less": [], "let": 5, "letter": [], "level": [5, 7], "levenshtein": [], "leverag": [], "lf": [], "libffi": 4, "librari": 4, "light": 3, "lightweight": [], "like": [], "limits_": 7, "line": [3, 7], "line_1_1": [], "link": [], "linknet": [3, 5], "linknet16": 5, "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 4, "list": [1, 2, 6], "ll": 7, "load": [3, 5], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 3, 5, 7], "localis": [], "localizationconfus": 7, "locat": [], "login": [], "login_to_hub": [], "logo": 2, "love": [], "lower": [6, 7], "m": [5, 7], "m1": [], "macbook": [], "machin": [], "maco": 4, "made": 3, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 3, "mainten": [], "make": [5, 7], "mani": [], "manipul": [], "map": 1, "map_loc": [], "mask_shap": 7, "master": [3, 5], "match": [3, 7], "mathcal": 7, "matplotlib": 7, "max": 7, "max_angl": [], "max_area": [], "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": [], "maximum": 1, "maxval": [5, 6], "mbox": 7, "mean": [6, 7], "meaniou": 7, "meant": 2, "measur": 5, "media": [], "median": [], 
"meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [5, 7], "middl": [], "might": 5, "min": [], "min_area": [], "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": [], "min_val": 6, "minde": 4, "minim": [], "minimalist": [], "minimum": 7, "minval": 6, "miss": [], "mistak": [], "mix": 3, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": [], "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 4, "model": [1, 7], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [2, 5, 6, 7], "more": [], "moscardi": [], "most": 5, "mozilla": [], "multi": 3, "multilingu": [], "multipl": [1, 2, 6], "multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 5, 7], "na": [], "name": [1, 5], "nation": [], "natur": 3, "ndarrai": [1, 2, 7], "necessari": [], "need": [4, 7], "neg": 6, "nest": [], "nestedobject": [], "netraj": [], "network": [3, 5], "neural": [3, 5], "new": [], "newer": [], "next": 1, "nois": [], "noisi": [1, 3], "non": [2, 3, 6, 7], "none": [1, 2, 7], "normal": [5, 6], "norwegian": [], "note": 0, "now": 3, "np": [5, 7], "num_output_channel": [], "num_sampl": [], "number": [1, 6, 7], "numpi": [2, 5, 7], "o": 4, "obb": [], "obj_detect": [], "object": 1, "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [1, 3, 7], "ocr_carea": [], "ocr_db_crnn": 7, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 5, "ocrdataset": 1, "ocrmetr": 7, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": 5, "one": [1, 5, 6], "oneof": 6, "ones": 1, "onli": [6, 7], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [3, 5], "optim": 3, "option": 1, "order": [1, 2, 5], "org": 5, "organ": 2, "orient": 2, "orientationpredictor": [], "other": [], "otherwis": 7, "our": 5, "out": [5, 6, 7], "outpout": [], "output": [2, 5, 6], "output_s": [2, 6], "outsid": [], "over": [4, 7], "overal": [], "overlai": 2, "overview": [], "overwrit": 1, "overwritten": [], "own": 3, "p": 6, "packag": 7, "pad": [1, 5, 6], "page": [4, 5, 7], "page1": 2, "page2": 2, "page_1": [], "page_idx": 2, "page_orientation_predictor": [], "page_param": [], "pair": 7, "pango": 4, "paper": 5, "par_1_1": [], "paragraph": [], "paragraph_break": [], "parallel": [], "param": [5, 6], "paramet": [1, 2, 3, 5, 6, 7], "pars": [1, 3], "parseq": [], "part": 6, "parti": [], "partial": [], "particip": [], "pass": [1, 5], "password": [], "patch": [], "path": [1, 2, 5], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "patil": [], "pattern": [], "pdf": [2, 5], "pdfpage": [], "peopl": [], "per": [5, 6], "perform": [2, 3, 5, 6, 7], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 2, "pick": 6, "pictur": 2, "pip": 4, "pipelin": [], "pixbuf": 4, "pixel": [2, 6], "platinum": 5, "pleas": [], "plot": 7, "plt": 7, "plug": [], "plugin": [], "png": 2, "point": [], "polici": [], "polish": [], "polit": [], "polygon": 1, "pool": [], "portugues": [], "posit": 7, "possibl": 7, "post": 5, "postprocessor": [], "potenti": 5, "power": 3, "ppageno": [], "pre": [], "precis": [5, 7], "pred": [], 
"pred_box": [], "pred_label": [], "predefin": 1, "predict": [2, 7], "predictor": [], "prefer": 1, "preinstal": [], "preprocessor": 5, "prerequisit": 3, "present": [], "preserv": 6, "preserve_aspect_ratio": 6, "pretrain": [3, 5, 7], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": 5, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 3], "processor": 5, "produc": 5, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 5, "provid": [3, 5], "public": 3, "publicli": [], "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": [], "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 7, "python": 3, "python3": [], "pytorch": [3, 4], "q": [], "qr": 2, "qr_code": [], "qualiti": 6, "quantiz": 5, "quantize_model": 5, "question": [], "quickli": 3, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [5, 7], "random": [5, 6, 7], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": [], "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": [], "randomsatur": 6, "randomshadow": [], "rang": 6, "rassi": [], "ratio": 6, "raw": [2, 7], "re": [], "read": [3, 5], "read_html": 2, "read_img": 2, "read_img_as_numpi": [], "read_img_as_tensor": [], "read_pdf": 2, "readi": [], "real": [5, 6], "realli": [], "reason": [], "rebuild": [], "rebuilt": [], "recal": [5, 7], "receipt": [1, 3, 5], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 7, "recognition_predictor": 5, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": [], "recurr": 3, "reduc": 6, "refer": 4, "regardless": [], "region": [], "regroup": 7, "regular": [], "reject": [], "rel": 2, "relat": [], "releas": [0, 4], "relev": [], "religion": [], "relu": 5, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [2, 5], "represent": 5, "request": [], "requir": [4, 6], "research": 3, "residu": [], "resiz": [5, 6], "resnet": 5, "resnet18": [], "resnet31": [], "resnet34": [], "resnet50": [], "resolv": 2, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [6, 7], "restrict": [], "result": [2, 5], "return": [1, 2, 5, 7], "reusabl": 5, "review": [], "rgb": [2, 6], "rgb_mode": [], "rgb_output": 2, "right": [5, 7], "roboflow": [], "robust": 3, "root": 1, "rotat": [1, 2], "rotated_bbox": [1, 7], "run": 4, "same": [2, 7], "sampl": 1, "sample_transform": 1, "sanjin": [], "sar": [3, 5], "sar_resnet31": 5, "sar_vgg16_bn": 5, "satur": 6, "save": [1, 5], "saved_model": 5, "scale": 7, "scale_rang": [], "scan": [1, 3], "scene": [3, 5], "scheme": 5, "score": 7, "scratch": 3, "script": [], "seamless": 3, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 5, "section": [], "secur": [], "see": [], "seemlessli": 3, "seen": 5, "segment": 5, "self": [], "semant": 5, "send": [], "sens": 7, "sensit": [], "separ": 5, "sequenc": [1, 2, 5, 7], "sequenti": [5, 6], "seri": [], "serial": 5, "serialized_model": 5, "seriou": [], "set": [1, 5, 7], "set_global_polici": [], "sever": [2, 6], "sex": [], "sexual": [], "sha256": [], "shade": [], "shape": [2, 5, 6, 7], "share": [], "shift": 6, "shm": [], "should": [1, 2, 7], "show": [2, 3, 5, 7], "showcas": [], "shuffl": 1, "side": 7, "signatur": 2, "signific": 1, "simpl": 5, "simpler": [], "sinc": 1, "singl": [], "single_img_doc": [], "size": [1, 2, 5, 6], "skew": [], 
"slack": [], "slightli": [], "small": 3, "smallest": 2, "snapshot_download": [], "snippet": [], "so": [1, 4], "social": [], "socio": [], "some": [], "someth": [], "somewher": [], "sort": [], "sourc": [1, 2, 5, 6, 7], "space": [], "span": [], "spanish": [], "spatial": 2, "special": 3, "specif": [1, 5, 7], "specifi": 2, "speed": [3, 5], "sphinx": [], "sroie": [1, 3], "stabl": 4, "stackoverflow": [], "stage": 3, "standalon": [], "standard": 6, "start": 1, "state": 3, "static": 7, "statist": 5, "statu": [], "std": 6, "step": [], "still": [], "str": [1, 2, 5, 6, 7], "straight": 1, "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 2, "street": [], "strict": [], "strictli": 7, "string": [1, 2, 5, 7], "strive": [], "strong": 5, "structur": [3, 5], "subset": [1, 5], "suggest": [], "sum": 7, "summari": 7, "support": 5, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": [], "symmetr": 6, "symmetric_pad": 6, "synthet": [], "synthtext": [], "system": [], "t": 1, "tabl": [], "take": [], "target": [1, 2, 5, 6], "target_s": 1, "task": [1, 3, 5], "task2": [], "team": [], "techminde": [], "templat": 2, "tensor": [1, 5, 6], "tensorflow": [3, 4, 5, 6], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [2, 7], "text_output": [], "textmatch": 7, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [3, 5], "textstylebrush": [], "textual": [1, 2, 3], "tf": [5, 6], "tf_model": 5, "tflite": 5, "than": [4, 7], "thank": [], "thei": [], "them": [1, 4], "thi": [4, 5, 7], "thing": [], "third": [], "those": [2, 4, 5], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 5, 7], "tini": [], "titl": 2, "tm": [], "tmp": [], "togeth": [2, 5], "tograi": 6, "tool": [], "top": 7, "topic": [], "torch": [], "torchvis": 6, "total": [], "toward": [], "train": [1, 5, 6], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": 5, "tranform": 6, "transcrib": [], "transfer": [], "transfo": 6, "transform": [1, 3], "translat": [], "troll": [], "true": [1, 2, 5, 6, 7], "truth": 7, "tune": 3, "tupl": [2, 5, 6, 7], "turn": [], "two": 2, "txt": [], "type": [2, 5], "typic": [], "u": [], "ucsd": [], "udac": [], "uint8": [2, 5, 7], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 2, "understand": [1, 3], "unidecod": 7, "uniform": [5, 6], "uniformli": [], "uninterrupt": 2, "union": 7, "unit": [], "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 5, "updat": 7, "upgrad": [], "upper": 6, "uppercas": [], "url": [1, 2], "us": [1, 4, 7], "usabl": 5, "usag": 5, "use_polygon": [], "useabl": [], "user": [2, 3, 4], "utf": [], "util": [3, 5], "v0": 3, "v1": [], "v3": [], "valid": [], "valu": [2, 6], "valuabl": 3, "variabl": [], "varieti": [], "veri": [], "verifi": 1, "verma": [], "version": 5, "vgg": 5, "vgg16": 5, "vgg16_bn_r": [], "via": 3, "video": [], "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 1, "visiontransform": [], "visual": 3, "visualize_pag": 7, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": [3, 5], "vocabulari": [], "w": [2, 7], "w3": [], "wa": [], "wai": [1, 3, 5], "want": [], "warm": 5, "warmup": [], "wasn": [], "we": [2, 3, 5, 6], "weasyprint": [], "web": 2, "websit": [], "welcom": 3, "well": [], "were": 2, "what": [], "when": [], "whenev": [], "where": [2, 7], "whether": [1, 2, 7], "which": 5, 
"whichev": 4, "while": 6, "why": [], "width": 2, "wiki": [], "wildreceipt": [], "window": [4, 7], "wish": [], "within": [], "without": 5, "wonder": [], "word": [3, 5, 7], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 7, "work": [], "worker": 1, "workflow": [], "worklow": [], "world": 7, "worth": [], "wrap": [], "wrapper": [1, 6], "write": [], "written": 2, "www": 2, "x": [2, 6, 7], "x12larg": 5, "x_ascend": [], "x_descend": [], "x_i": 7, "x_size": [], "x_wconf": [], "xeon": 5, "xhtml": [], "xmax": 2, "xmin": 2, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 7, "y_i": 7, "y_j": 7, "yet": [], "ymax": 2, "ymin": 2, "yolov8": [], "you": [4, 5], "your": [1, 2, 5, 7], "yoursit": 2, "yugesh": [], "zero": [5, 6], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "doctr.documents", "DocTR: Document Text Recognition", "Installation", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": [], "03": 0, "04": [], "05": 0, "07": [], "08": [], "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "21": [], "22": [], "27": [], "28": 0, "29": [], "3": [], "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 5, "architectur": [], "arg": [], "artefact": 2, "artefactdetect": [], "attribut": [], "avail": 1, "aw": [], "ban": [], "block": 2, "bug": [], "build": 3, "changelog": 0, "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 5, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 3], "detect": [3, 5], "develop": [], "do": [], "doctr": [1, 2, 3, 5, 6, 7], "document": [2, 3], "end": 5, "enforc": [], "evalu": 7, "export": 5, "factori": [], "featur": 3, "feedback": [], "file": 2, "from": [], "gener": [], "get": 3, "git": 4, "guidelin": [], "half": [], "hub": [], "huggingfac": [], "i": [], "implement": [], "infer": [], "instal": 4, "integr": [], "io": [], "lambda": [], "let": [], "line": 2, "linux": [], "load": 1, "loader": [], "main": 3, "mode": [], "model": [3, 5], "modifi": [], "modul": [], "name": [], "note": 3, "notebook": [], "object": [], "ocr": 5, "onli": [], "onnx": [], "optim": 
[], "option": [], "orient": [], "our": [], "output": [], "own": [], "packag": [3, 4], "page": 2, "perman": [], "pipelin": [], "pledg": [], "post": [], "pre": 5, "precis": [], "predictor": [3, 5], "prepar": [], "prerequisit": 4, "pretrain": [], "process": 5, "push": [], "python": 4, "qualiti": [], "question": [], "read": 2, "readi": [], "recognit": [3, 5], "refer": 3, "report": [], "request": [], "resourc": [], "respons": [], "return": [], "right": [], "savedmodel": 5, "scope": [], "share": [], "should": [], "stage": 5, "standard": [], "start": 3, "structur": 2, "style": [], "support": [1, 3, 6], "synthet": [], "task": 7, "temporari": [], "test": [], "text": [3, 5], "train": 3, "transform": 6, "two": 5, "unit": [], "us": 5, "util": 7, "v0": 0, "verif": [], "via": 4, "visual": 7, "vocab": 1, "warn": [], "what": [], "word": 2, "your": 3, "zoo": [3, 5]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, "codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], 
"Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, "end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], "Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train 
your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], 
"channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], 
"line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation 
(class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", 
false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", 
"Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], 
"51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, 
"b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], 
"db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 
18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], 
"json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 
19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 
15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 
3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 
18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, 
"coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.4.1/transforms.html b/v0.4.1/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.4.1/transforms.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.

-
-

Supported transformations

-

Here are all the transformations that are available through docTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
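For instance (a minimal sketch assuming the TensorFlow backend, with the padding filled with zeros as described above), resizing a portrait image while preserving its aspect ratio pads it up to the target size:
>>> import tensorflow as tf
>>> from doctr.transforms import Resize
>>> # keep the aspect ratio of a 64x32 image and pad it symmetrically to 32x32
>>> transfo = Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
>>> out = transfo(tf.random.uniform(shape=[64, 32, 3], minval=0, maxval=1))
>>> out.shape  # (32, 32, 3): the content is scaled to 32x16 and padded equally along the width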
- -
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor to a Gaussian distribution for each channel

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
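Concretely, assuming the usual channel-wise standardization, each channel is mapped to (x - mean) / std, so a channel matching the provided statistics ends up with zero mean and unit variance.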
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a user-defined function (lambda) to a tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x / 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – the offset added to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – the multiplicative factor is picked in [1-delta, 1+delta] (contrast is reduced if the factor is < 1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – the multiplicative factor is picked in [1-delta, 1+delta] (saturation is reduced if the factor is < 1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – the offset added to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.

-
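As a rough sketch (assuming the TensorFlow backend; the particular transformations are only illustrative), a training augmentation pipeline could chain several of the modules documented below:
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, OneOf, RandomApply, Resize
>>> from doctr.transforms import RandomBrightness, RandomContrast, RandomJpegQuality
>>> # resize first, then apply exactly one photometric change, then sometimes degrade JPEG quality
>>> pipeline = Compose([
...     Resize((512, 512)),
...     OneOf([RandomBrightness(max_delta=0.3), RandomContrast(delta=0.3)]),
...     RandomApply(RandomJpegQuality(min_quality=60), p=0.5),
... ])
>>> out = pipeline(tf.random.uniform(shape=[640, 480, 3], minval=0, maxval=1))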
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomGamma, RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, from which only one will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Apply the input transformation with a probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.4.1/using_doctr/custom_models_training.html b/v0.4.1/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.4.1/using_doctr/custom_models_training.html +++ b/v0.4.1/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.4.1/using_doctr/running_on_aws.html b/v0.4.1/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.4.1/using_doctr/running_on_aws.html +++ b/v0.4.1/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.4.1/using_doctr/sharing_models.html b/v0.4.1/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.4.1/using_doctr/sharing_models.html +++ b/v0.4.1/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.4.1/using_doctr/using_contrib_modules.html b/v0.4.1/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.4.1/using_doctr/using_contrib_modules.html +++ b/v0.4.1/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.4.1/using_doctr/using_datasets.html b/v0.4.1/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.4.1/using_doctr/using_datasets.html +++ b/v0.4.1/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.4.1/using_doctr/using_model_export.html b/v0.4.1/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.4.1/using_doctr/using_model_export.html +++ b/v0.4.1/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.4.1/using_doctr/using_models.html b/v0.4.1/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.4.1/using_doctr/using_models.html +++ b/v0.4.1/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.4.1/utils.html b/v0.4.1/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.4.1/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.utils

-

This module regroups non-core features that are complementary to the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest windows side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model's performance.

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements a text match metric (word-level accuracy) for the recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, \quad TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, \quad f_a(x) = \left\{ \begin{array}{ll} 1 & \mbox{if } x = a \\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, and \(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
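To make the formula concrete (a hedged illustration; the exact keys of the returned dictionary may differ between versions): in the update above, 'Hello' only matches 'hello' once lower-cased while 'world' matches exactly, so the raw exact-match score is 0.5 and the lower-case score is 1.0.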
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode counterpart and its lower-case unidecode counterpart

-
-
-
- -
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and \(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, \quad g_X(y) = \left\{ \begin{array}{ll} 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, \(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
-
-
-
-
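As a worked illustration of the update above (arithmetic only, not an exact transcript of the returned values): the prediction [0, 0, 70, 70] overlaps the ground truth [0, 0, 100, 100] with IoU = 4900 / 10000 = 0.49, which is below the default iou_thresh of 0.5, and the second prediction does not overlap at all; recall and precision are therefore both 0, while the mean IoU is (0.49 + 0) / 2 = 0.245.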
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float | None, float | None, float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores

-
-
-
- -
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements an end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and \(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, \quad h_{B,L}(b, l) = \left\{ \begin{array}{ll} 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j \\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, \(\mathcal{L}\) is the set of possible character sequences, \(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
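The same arithmetic as in the localization example applies here (again only a hedged illustration): since the best IoU between the predicted and ground-truth boxes is 0.49 < 0.5, no box is matched regardless of the transcriptions, so recall and precision are 0 for every string comparison flexibility while the mean IoU is 0.245.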
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall & precision for each string comparison flexibility and the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.5.0/_modules/doctr/datasets/cord.html b/v0.5.0/_modules/doctr/datasets/cord.html index f98ee6901c..55b0584830 100644 --- a/v0.5.0/_modules/doctr/datasets/cord.html +++ b/v0.5.0/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.cord

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
-from doctr.utils.geometry import fit_rbbox
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['CORD']
+__all__ = ["CORD"]
 
 
 
-[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - Example:: - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 + :align: center + + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', - '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', - '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", + "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", + "cord_train.zip", + ) + + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", + "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", + "cord_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - - # # List images - self.root = os.path.join(self._root, 'image') - self.data: List[Tuple[str, Dict[str, Any]]] = [] + # List images + tmp_root = os.path.join(self.root, "image") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] self.train = train - self.sample_transforms = sample_transforms - for img_path in os.listdir(self.root): + np_dtype = np.float32 + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem _targets = [] - with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: if len(word["text"]) > 0: x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - if rotated_bbox: - box = list(fit_rbbox(np.array([ - [x[0], y[0]], - [x[1], y[1]], - [x[2], y[2]], - [x[3], y[3]], - ], dtype=np.float32))) + box: Union[List[float], np.ndarray] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], + dtype=np_dtype, + ) else: - # Reduce 8 coords to 4 + # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax box = [min(x), min(y), max(x), max(y)] - _targets.append((word['text'], box)) + _targets.append((word["text"], box)) text_targets, box_targets = zip(*_targets) - self.data.append(( - img_path, - dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets) - )) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
@@ -397,8 +461,8 @@

Source code for doctr.datasets.cord

       
     
   
-
- + + diff --git a/v0.5.0/_modules/doctr/datasets/core.html b/v0.5.0/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.5.0/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.5.0/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.5.0/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset, _VisionDataset): - pass
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/_modules/doctr/datasets/detection.html b/v0.5.0/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.5.0/_modules/doctr/datasets/detection.html +++ b/v0.5.0/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/doc_artefacts.html b/v0.5.0/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.5.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.5.0/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.5.0/_modules/doctr/datasets/funsd.html b/v0.5.0/_modules/doctr/datasets/funsd.html index 35d7ad4cf5..f08612f9fa 100644 --- a/v0.5.0/_modules/doctr/datasets/funsd.html +++ b/v0.5.0/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.funsd

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['FUNSD']
+__all__ = ["FUNSD"]
 
 
 
-[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - Example:: - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 + :align: center + + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' - SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' - FILE_NAME = 'funsd.zip' + URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" + SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" + FILE_NAME = "funsd.zip" def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + super().__init__( + self.URL, + self.FILE_NAME, + self.SHA256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - self.sample_transforms = sample_transforms + np_dtype = np.float32 # Use the subset - subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') + subfolder = os.path.join("dataset", "training_data" if train else "testing_data") # # List images - self.root = os.path.join(self._root, subfolder, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + tmp_root = os.path.join(self.root, subfolder, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: data = json.load(f) - _targets = [(word['text'], word['box']) for block in data['form'] - for word in block['words'] if len(word['text']) > 0] + _targets = [ + (word["text"], word["box"]) + for block in data["form"] + for word in block["words"] + if len(word["text"]) > 0 + ] text_targets, box_targets = zip(*_targets) - if rotated_bbox: - # box_targets: xmin, ymin, xmax, ymax -> x, y, w, h, alpha = 0 - box_targets = [ + if use_polygons: + # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + box_targets = [ # type: ignore[assignment] [ - (box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0 - ] for box in box_targets + [box[0], box[1]], + [box[2], box[1]], + [box[2], box[3]], + [box[0], box[3]], + ] + for box in box_targets ] - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets))) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) + ) + for crop, label in zip(crops, list(text_targets)): + # filter labels with unknown characters + if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
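FUNSD gets the same constructor changes; when `use_polygons=True` each straight `word["box"]` is expanded into its four corner points. A short sketch of the resulting target geometry, inferred from the list comprehension above and assuming the archive is available:

import numpy as np
from doctr.datasets import FUNSD

train_set = FUNSD(train=True, download=True, use_polygons=True)
img, target = train_set[0]

boxes = np.asarray(target["boxes"])
# Each straight box was expanded to 4 (x, y) corners: (num_words, 4, 2)
print(boxes.shape, len(target["labels"]))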
@@ -386,8 +453,8 @@

Source code for doctr.datasets.funsd

       
     
   
-
- + + diff --git a/v0.5.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.5.0/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.5.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.5.0/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/datasets/ic03.html b/v0.5.0/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.5.0/_modules/doctr/datasets/ic03.html +++ b/v0.5.0/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/ic13.html b/v0.5.0/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.5.0/_modules/doctr/datasets/ic13.html +++ b/v0.5.0/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/iiit5k.html b/v0.5.0/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.5.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.5.0/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/iiithws.html b/v0.5.0/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.5.0/_modules/doctr/datasets/iiithws.html +++ b/v0.5.0/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/imgur5k.html b/v0.5.0/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.5.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.5.0/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/loader.html b/v0.5.0/_modules/doctr/datasets/loader.html index d32e6da298..ed80350ef0 100644 --- a/v0.5.0/_modules/doctr/datasets/loader.html +++ b/v0.5.0/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.loader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-import tensorflow as tf
-import numpy as np
-from typing import Optional
+from typing import Callable, Optional
 
-from .multithreading import multithread_exec
+import numpy as np
+import tensorflow as tf
 
 __all__ = ["DataLoader"]
 
@@ -293,12 +314,13 @@ 

Source code for doctr.datasets.loader

     """Collate multiple elements into batches
 
     Args:
+    ----
         samples: list of N tuples containing M elements
 
     Returns:
+    -------
         Tuple of M sequences contianing N elements each
     """
-
     batch_data = zip(*samples)
 
     tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
@@ -307,23 +329,23 @@ 

Source code for doctr.datasets.loader

 
 
 
-[docs] +[docs] class DataLoader: """Implements a dataset wrapper for fast data loading - Example:: - >>> from doctr.datasets import FUNSD, DataLoader - >>> train_set = CORD(train=True, download=True) - >>> train_loader = DataLoader(train_set, batch_size=32) - >>> train_iter = iter(train_loader) - >>> images, targets = next(train_iter) + >>> from doctr.datasets import CORD, DataLoader + >>> train_set = CORD(train=True, download=True) + >>> train_loader = DataLoader(train_set, batch_size=32) + >>> train_iter = iter(train_loader) + >>> images, targets = next(train_iter) Args: + ---- dataset: the dataset shuffle: whether the samples should be shuffled before passing it to the iterator batch_size: number of elements in each batch drop_last: if `True`, drops the last batch if it isn't full - workers: number of workers to use for data loading + collate_fn: function to merge samples into a batch """ def __init__( @@ -332,17 +354,22 @@

Source code for doctr.datasets.loader

         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        workers: Optional[int] = None,
+        collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
         self.batch_size = batch_size
         nb = len(self.dataset) / batch_size
         self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
-        self.workers = workers
+        if collate_fn is None:
+            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
+        else:
+            self.collate_fn = collate_fn
         self.reset()
 
+    def __len__(self) -> int:
+        return self.num_batches
+
     def reset(self) -> None:
         # Updates indices after each epoch
         self._num_yielded = 0
@@ -358,9 +385,9 @@ 

Source code for doctr.datasets.loader

         if self._num_yielded < self.num_batches:
             # Get next indices
             idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
+            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
 
-            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
+            samples = list(map(self.dataset.__getitem__, indices))
 
             batch_data = self.collate_fn(samples)
 
@@ -401,8 +428,8 @@ 

Source code for doctr.datasets.loader

       
     
   
-
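The `DataLoader` hunks above drop the `workers` argument in favour of an optional `collate_fn` and add `__len__`. A sketch of both code paths (the resize values and the choice of dataset are placeholders, not part of the change itself):

import tensorflow as tf
from doctr.datasets import FUNSD, DataLoader

train_set = FUNSD(train=True, download=True)

# Default path: falls back to the dataset's own collate_fn
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
print(len(train_loader))  # number of batches, via the new __len__

# Custom path: resize images so tf.stack receives uniform shapes
def resize_collate(samples):
    images, targets = zip(*samples)
    images = tf.stack([tf.image.resize(img, (512, 512)) for img in images], axis=0)
    return images, list(targets)

train_loader = DataLoader(train_set, batch_size=16, collate_fn=resize_collate)
images, targets = next(iter(train_loader))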
- +
+ diff --git a/v0.5.0/_modules/doctr/datasets/mjsynth.html b/v0.5.0/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.5.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.5.0/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/ocr.html b/v0.5.0/_modules/doctr/datasets/ocr.html index 11297d5952..ce1ed8b0d4 100644 --- a/v0.5.0/_modules/doctr/datasets/ocr.html +++ b/v0.5.0/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.ocr

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple
 
-from .datasets import AbstractDataset
-from doctr.utils.geometry import fit_rbbox
+import numpy as np
 
+from .datasets import AbstractDataset
 
-__all__ = ['OCRDataset']
+__all__ = ["OCRDataset"]
 
 
 
-[docs] +[docs] class OCRDataset(AbstractDataset): """Implements an OCR dataset + >>> from doctr.datasets import OCRDataset + >>> train_set = OCRDataset(img_folder="/path/to/images", + >>> label_file="/path/to/labels.json") + >>> img, target = train_set[0] + Args: + ---- img_folder: local path to image folder (all jpg at the root) label_file: local path to the label file - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) - **kwargs: keyword arguments from `VisionDataset`. + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + **kwargs: keyword arguments from `AbstractDataset`. """ def __init__( self, img_folder: str, label_file: str, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, **kwargs: Any, ) -> None: - - self.sample_transforms = sample_transforms - self.root = img_folder + super().__init__(img_folder, **kwargs) # List images self.data: List[Tuple[str, Dict[str, Any]]] = [] - with open(label_file, 'rb') as f: + np_dtype = np.float32 + with open(label_file, "rb") as f: data = json.load(f) - for file_dic in data: + for img_name, annotations in data.items(): # Get image path - img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg' + img_name = Path(img_name) # File existence check if not os.path.exists(os.path.join(self.root, img_name)): raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") # handle empty images - if (len(file_dic["coordinates"]) == 0 or - (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")): - self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[]))) + if len(annotations["typed_words"]) == 0: + self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[]))) continue - is_valid: List[bool] = [] - box_targets: List[List[float]] = [] - for box in file_dic["coordinates"]: - if rotated_bbox: - x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32)) - box = [x, y, w, h, alpha] - is_valid.append(w > 0 and h > 0) - else: - xs, ys = zip(*box) - box = [min(xs), min(ys), max(xs), max(ys)] - is_valid.append(box[0] < box[2] and box[1] < box[3]) - if is_valid[-1]: - box_targets.append(box) + # Unpack the straight boxes (xmin, ymin, xmax, ymax) + geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + geoms = [ + [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]] # type: ignore[list-item] + for geom in geoms + ] + + text_targets = [obj["value"] for obj in annotations["typed_words"]] - text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid] - self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
+ self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
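`OCRDataset` now expects the label file to map each image name to a `typed_words` list holding `value`/`geometry` entries, replacing the old `raw-archive-filepath`/`coordinates`/`string` schema. A sketch of a matching label file and loading call; the sample JSON content is an illustration inferred from the parsing code above, not an official format reference:

# /path/to/labels.json (illustrative):
# {
#   "img_1.jpg": {
#     "typed_words": [
#       {"value": "Invoice", "geometry": [120.0, 80.0, 310.0, 140.0]},
#       {"value": "Total", "geometry": [120.0, 600.0, 210.0, 640.0]}
#     ]
#   }
# }
from doctr.datasets import OCRDataset

train_set = OCRDataset(
    img_folder="/path/to/images",
    label_file="/path/to/labels.json",
    use_polygons=False,  # keep (xmin, ymin, xmax, ymax) boxes
)
img, target = train_set[0]
print(target["boxes"].shape, target["labels"])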
@@ -383,8 +402,8 @@

Source code for doctr.datasets.ocr

       
     
   
- - + + diff --git a/v0.5.0/_modules/doctr/datasets/recognition.html b/v0.5.0/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.5.0/_modules/doctr/datasets/recognition.html +++ b/v0.5.0/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/sroie.html b/v0.5.0/_modules/doctr/datasets/sroie.html index 66fd4ca3e0..04cf10bda2 100644 --- a/v0.5.0/_modules/doctr/datasets/sroie.html +++ b/v0.5.0/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.sroie

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import csv
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['SROIE']
+__all__ = ["SROIE"]
 
 
 
-[docs] +[docs] class SROIE(VisionDataset): """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction" <https://arxiv.org/pdf/2103.10213.pdf>`_. - Example:: - >>> from doctr.datasets import SROIE - >>> train_set = SROIE(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0 + :align: center + + >>> from doctr.datasets import SROIE + >>> train_set = SROIE(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip', - 'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip', - '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0", + "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f", + "sroie2019_train_task1.zip", + ) + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0", + "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2", + "sroie2019_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - self.sample_transforms = sample_transforms self.train = train - if rotated_bbox: - raise NotImplementedError + tmp_root = os.path.join(self.root, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + np_dtype = np.float32 - # # List images - self.root = os.path.join(self._root, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - _targets = [] - with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f: - for row in csv.reader(f, delimiter=','): - # Safeguard for blank lines - if len(row) > 0: - # Label may contain commas - label = ",".join(row[8:]) - # Reduce 8 coords to 4 - p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8]) - left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x) - top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y) - if len(label) > 0: - _targets.append((label, [left, top, right, bot])) - - text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets))) + with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f: + _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0] + + labels = [",".join(row[8:]) for row in _rows] + # reorder coordinates (8 -> (4,2) -> + # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines + coords: np.ndarray = np.stack( + [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0 + ) + + if not use_polygons: + # xmin, ymin, xmax, ymax + coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1) + + if recognition_task: + crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords) + for crop, label in zip(crops, labels): + if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0: + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, coords)) + else: + self.data.append((img_path, dict(boxes=coords, labels=labels))) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
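SROIE follows the same pattern: annotations are parsed into an `(N, 4, 2)` coordinate array, collapsed to straight boxes unless `use_polygons=True`, and `recognition_task=True` pre-crops every word with `crop_bboxes_from_image`. A hedged sketch, assuming the archives download correctly:

from doctr.datasets import SROIE

# Detection-style targets: one array of polygons per receipt image
det_set = SROIE(train=True, download=True, detection_task=True, use_polygons=True)
img, boxes = det_set[0]

# Recognition-style targets: (cropped word image, transcription) pairs
reco_set = SROIE(train=False, download=True, recognition_task=True)
crop, word = reco_set[0]
print(word)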
@@ -390,8 +444,8 @@

Source code for doctr.datasets.sroie

       
     
   
-
- + + diff --git a/v0.5.0/_modules/doctr/datasets/svhn.html b/v0.5.0/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.5.0/_modules/doctr/datasets/svhn.html +++ b/v0.5.0/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/svt.html b/v0.5.0/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.5.0/_modules/doctr/datasets/svt.html +++ b/v0.5.0/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/synthtext.html b/v0.5.0/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.5.0/_modules/doctr/datasets/synthtext.html +++ b/v0.5.0/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.5.0/_modules/doctr/datasets/utils.html b/v0.5.0/_modules/doctr/datasets/utils.html index 2259698c0f..bde9304597 100644 --- a/v0.5.0/_modules/doctr/datasets/utils.html +++ b/v0.5.0/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.utils

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import string
 import unicodedata
+from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Sequence as SequenceType
+
 import numpy as np
-from typing import List, Optional, Any
+from PIL import Image
+
+from doctr.io.image import get_img_shape
+from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
 
 from .vocabs import VOCABS
 
-__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
+__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
+
+ImageTensor = TypeVar("ImageTensor")
 
 
 def translate(
     input_string: str,
     vocab_name: str,
-    unknown_char: str = '■',
+    unknown_char: str = "■",
 ) -> str:
     """Translate a string input in a given vocabulary
 
     Args:
+    ----
         input_string: input string to translate
         vocab_name: vocabulary to use (french, latin, ...)
         unknown_char: unknown character for non-translatable characters
 
     Returns:
-        A string translated in a given vocab"""
-
+    -------
+        A string translated in a given vocab
+    """
     if VOCABS.get(vocab_name) is None:
         raise KeyError("output vocabulary must be in vocabs dictionary")
 
-    translated = ''
+    translated = ""
     for char in input_string:
         if char not in VOCABS[vocab_name]:
             # we need to translate char into a vocab char
@@ -315,51 +350,63 @@ 

Source code for doctr.datasets.utils

                 # remove whitespaces
                 continue
             # normalize character if it is not in vocab
-            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
-            if char == '' or char not in VOCABS[vocab_name]:
+            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
+            if char == "" or char not in VOCABS[vocab_name]:
                 # if normalization fails or char still not in vocab, return unknown character
                 char = unknown_char
         translated += char
     return translated
 
 
-def encode_sequence(
+def encode_string(
     input_string: str,
     vocab: str,
 ) -> List[int]:
     """Given a predefined mapping, encode the string to a sequence of numbers
 
     Args:
+    ----
         input_string: string to encode
         vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A list encoding the input_string"""
-
-    return list(map(vocab.index, input_string))  # type: ignore[arg-type]
+    -------
+        A list encoding the input_string
+    """
+    try:
+        return list(map(vocab.index, input_string))
+    except ValueError:
+        raise ValueError(
+            f"some characters cannot be found in 'vocab'. \
+                         Please check the input string {input_string} and the vocabulary {vocab}"
+        )
 
 
 def decode_sequence(
-    input_array: np.array,
+    input_seq: Union[np.ndarray, SequenceType[int]],
     mapping: str,
 ) -> str:
     """Given a predefined mapping, decode the sequence of numbers to a string
 
     Args:
-        input_array: array to decode
+    ----
+        input_seq: array to decode
         mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A string, decoded from input_array"""
-
-    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
+    -------
+        A string, decoded from input_seq
+    """
+    if not isinstance(input_seq, (Sequence, np.ndarray)):
+        raise TypeError("Invalid sequence type")
+    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
         raise AssertionError("Input must be an array of int, with max less than mapping size")
-    decoded = ''.join(mapping[idx] for idx in input_array)
-    return decoded
+
+    return "".join(map(mapping.__getitem__, input_seq))
 
 
 
-[docs] +[docs] def encode_sequences( sequences: List[str], vocab: str, @@ -367,48 +414,53 @@

Source code for doctr.datasets.utils

     eos: int = -1,
     sos: Optional[int] = None,
     pad: Optional[int] = None,
-    **kwargs: Any,
+    dynamic_seq_length: bool = False,
 ) -> np.ndarray:
     """Encode character sequences using a given vocab as mapping
 
     Args:
+    ----
         sequences: the list of character sequences of size N
         vocab: the ordered vocab to use for encoding
         target_size: maximum length of the encoded data
         eos: encoding of End Of String
         sos: optional encoding of Start Of String
         pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
+        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
 
     Returns:
+    -------
         the padded encoded data as a tensor
     """
-
     if 0 <= eos < len(vocab):
         raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
 
-    if not isinstance(target_size, int):
-        target_size = max(len(w) for w in sequences)
-        if sos:
-            target_size += 1
-        if pad:
-            target_size += 1
+    if not isinstance(target_size, int) or dynamic_seq_length:
+        # Maximum string length + EOS
+        max_length = max(len(w) for w in sequences) + 1
+        if isinstance(sos, int):
+            max_length += 1
+        if isinstance(pad, int):
+            max_length += 1
+        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
 
     # Pad all sequences
-    if pad:  # pad with padding symbol
+    if isinstance(pad, int):  # pad with padding symbol
         if 0 <= pad < len(vocab):
             raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
         # In that case, add EOS at the end of the word before padding
-        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
+        default_symbol = pad
     else:  # pad with eos symbol
-        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
+        default_symbol = eos
+    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
 
-    for idx, seq in enumerate(sequences):
-        encoded_seq = encode_sequence(seq, vocab)
-        if pad:  # add eos at the end of the sequence
-            encoded_seq.append(eos)
-        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
+    # Encode the strings
+    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
+        if isinstance(pad, int):  # add eos at the end of the sequence
+            seq.append(eos)
+        encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)]
 
-    if sos:  # place eos symbol at the beginning of each sequence
+    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
         if 0 <= sos < len(vocab):
             raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
         encoded_data = np.roll(encoded_data, 1)
@@ -416,6 +468,59 @@ 

Source code for doctr.datasets.utils

 
     return encoded_data
+ + +def convert_target_to_relative( + img: ImageTensor, target: Union[np.ndarray, Dict[str, Any]] +) -> Tuple[ImageTensor, Union[Dict[str, Any], np.ndarray]]: + if isinstance(target, np.ndarray): + target = convert_to_relative_coords(target, get_img_shape(img)) + else: + target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img)) + return img, target + + +def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]: + """Crop a set of bounding boxes from an image + + Args: + ---- + img_path: path to the image + geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4) + + Returns: + ------- + a list of cropped images + """ + with Image.open(img_path) as pil_img: + img: np.ndarray = np.array(pil_img.convert("RGB")) + # Polygon + if geoms.ndim == 3 and geoms.shape[1:] == (4, 2): + return extract_rcrops(img, geoms.astype(dtype=int)) + if geoms.ndim == 2 and geoms.shape[1] == 4: + return extract_crops(img, geoms.astype(dtype=int)) + raise ValueError("Invalid geometry format") + + +def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]: + """Converts multiclass target to relative coordinates. + + Args: + ---- + img: Image + target: tuple of target polygons and their classes names + + Returns: + ------- + Image and dictionary of boxes, with class names as keys + """ + boxes = convert_to_relative_coords(target[0], get_img_shape(img)) + boxes_classes = target[1] + boxes_dict: Dict = {k: [] for k in sorted(set(boxes_classes))} + for k, poly in zip(boxes_classes, boxes): + boxes_dict[k].append(poly) + boxes_dict = {k: np.stack(v, axis=0) for k, v in boxes_dict.items()} + return img, boxes_dict
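`encode_sequence` is renamed to `encode_string` (now raising `ValueError` on out-of-vocab characters) and `encode_sequences` gains `dynamic_seq_length` plus stricter `eos`/`sos`/`pad` handling. A small worked example of the padding logic as written above; the tiny vocab is purely illustrative:

import numpy as np
from doctr.datasets.utils import decode_sequence, encode_sequences, encode_string

vocab = "abc"
print(encode_string("cab", vocab))        # [2, 0, 1]
print(decode_sequence([0, 1, 2], vocab))  # "abc"

# With pad set, each word is followed by one EOS then padded to target_size
encoded = encode_sequences(
    sequences=["ab", "c"],
    vocab=vocab,
    target_size=5,
    eos=3,  # must lie outside the vocab indices
    pad=4,  # idem
)
print(encoded)
# Expected from the code above:
# [[0 1 3 4 4]
#  [2 3 4 4 4]]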
@@ -448,8 +553,8 @@

Source code for doctr.datasets.utils

       
     
   
- - + + diff --git a/v0.5.0/_modules/doctr/datasets/wildreceipt.html b/v0.5.0/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.5.0/_modules/doctr/datasets/wildreceipt.html +++ b/v0.5.0/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.5.0/_modules/doctr/documents/elements.html b/v0.5.0/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.5.0/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
-
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/_modules/doctr/documents/reader.html b/v0.5.0/_modules/doctr/documents/reader.html deleted file mode 100644 index cdcd814b6c..0000000000 --- a/v0.5.0/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
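Putting the PDF helpers together, a typical workflow looks like the following sketch (the path is a placeholder):

    from doctr.documents import DocumentFile

    pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
    pages = pdf_doc.as_images(output_size=(1024, 726))  # list of H x W x 3 ndarrays
    words = pdf_doc.get_words()                          # per-page list of (bounding box, value) tuples
    artefacts = pdf_doc.get_artefacts()                  # per-page list of image bounding boxes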
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
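The three constructors can therefore be used interchangeably depending on the input at hand; a short sketch with placeholder paths and URL:

    from doctr.documents import DocumentFile

    pdf_pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
    web_pages = DocumentFile.from_url("https://www.yoursite.com").as_images()
    img_pages = DocumentFile.from_images(["path/to/page1.png", "path/to/page2.png"])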
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/_modules/doctr/io/elements.html b/v0.5.0/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.5.0/_modules/doctr/io/elements.html +++ b/v0.5.0/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.5.0/_modules/doctr/io/html.html b/v0.5.0/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.5.0/_modules/doctr/io/html.html +++ b/v0.5.0/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.5.0/_modules/doctr/io/image/base.html b/v0.5.0/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.5.0/_modules/doctr/io/image/base.html +++ b/v0.5.0/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.5.0/_modules/doctr/io/image/tensorflow.html b/v0.5.0/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.5.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.5.0/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.5.0/_modules/doctr/io/pdf.html b/v0.5.0/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.5.0/_modules/doctr/io/pdf.html +++ b/v0.5.0/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.5.0/_modules/doctr/io/reader.html b/v0.5.0/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.5.0/_modules/doctr/io/reader.html +++ b/v0.5.0/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.5.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.5.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.5.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.5.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.5.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.5.0/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.5.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.5.0/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.5.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.5.0/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.5.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.5.0/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.5.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/models/classification/zoo.html b/v0.5.0/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.5.0/_modules/doctr/models/classification/zoo.html +++ b/v0.5.0/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

<
- + diff --git a/v0.5.0/_modules/doctr/models/detection/differentiable_binarization.html b/v0.5.0/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index 38e9b36ec2..0000000000 --- a/v0.5.0/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.differentiable_binarization - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to unshrink polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize p_map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: coordinates of the polygon to expand
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly cast to an ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
-
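As a quick illustration of the expansion distance used above: for a square polygon, area / perimeter = side / 4, so a 100-pixel square with the default unclip_ratio of 1.5 is pushed outwards by 37.5 pixels before the bounding rectangle is taken. A standalone sketch of that computation:

    from shapely.geometry import Polygon

    square = Polygon([(0, 0), (100, 0), (100, 100), (0, 100)])
    unclip_ratio = 1.5
    distance = square.area * unclip_ratio / square.length
    print(distance)  # 37.5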
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box (relative coordinates)
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
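Note that the returned boxes are relative to the page size; a hypothetical helper (not part of the library) to map them back to pixel coordinates could look like this sketch:

    import numpy as np

    def to_absolute(boxes: np.ndarray, height: int, width: int) -> np.ndarray:
        """Convert relative (xmin, ymin, xmax, ymax, score) boxes to pixel coordinates (hypothetical helper)."""
        abs_boxes = boxes.copy()
        abs_boxes[:, [0, 2]] *= width
        abs_boxes[:, [1, 3]] *= height
        return abs_boxes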
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channels to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature map is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
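A quick sanity check of this formula, assuming the module above is importable as shown: the point (1, 1) lies at distance 1 from the horizontal segment joining (0, 0) and (2, 0).

    import numpy as np
    from doctr.models.detection.differentiable_binarization import DBNet  # module shown above

    xs = np.array([[1.0]])
    ys = np.array([[1.0]])
    dist = DBNet.compute_distance(xs, ys, np.array([0.0, 0.0]), np.array([2.0, 0.0]))
    print(dist)  # [[1.]]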
-
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon treshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coordinates defining the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, fully masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
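Beyond the constructor example above, the model call also exposes the raw probability map and the post-processed boxes; a short sketch with a random input and untrained weights:

    import tensorflow as tf
    from doctr.models import db_resnet50

    model = db_resnet50(pretrained=False)
    input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    out = model(input_tensor, return_model_output=True, return_boxes=True)
    prob_map = out["out_map"]  # sigmoid probability map
    boxes = out["boxes"]       # post-processed relative (xmin, ymin, xmax, ymax, score) boxes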
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.5.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 9145c7c3fd..66cef8663d 100644 --- a/v0.5.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import List, Tuple, Optional, Any, Dict
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
+
+from ...classification import mobilenet_v3_large
 from .base import DBPostProcessor, _DBNet
 
-__all__ = ['DBNet', 'db_resnet50']
+__all__ = ["DBNet", "db_resnet50", "db_mobilenet_v3_large"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
+    "db_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_resnet50-649fa22b.weights.h5&src=0",
+    },
+    "db_mobilenet_v3_large": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_mobilenet_v3_large-ee2e1dbe.weights.h5&src=0",
     },
 }
 
@@ -313,6 +348,7 @@ 

Source code for doctr.models.detection.differentiable_binarization.tensorflo <https://arxiv.org/pdf/1612.03144.pdf>`_. Args: + ---- channels: number of channel to output """ @@ -322,9 +358,9 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo ) -> None: super().__init__() self.channels = channels - self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest') - self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)] - self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)] + self.upsample = layers.UpSampling2D(size=(2, 2), interpolation="nearest") + self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer="he_normal") for _ in range(4)] + self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2**idx) for idx in range(4)] @staticmethod def build_upsampling( @@ -334,20 +370,21 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo """Module which performs a 3x3 convolution followed by up-sampling Args: + ---- channels: number of output channels dilation_factor (int): dilation factor to scale the convolution output before concatenation Returns: + ------- a keras.layers.Layer object, wrapping these operations in a sequential module """ - - _layers = conv_sequence(channels, 'relu', True, kernel_size=3) + _layers = conv_sequence(channels, "relu", True, kernel_size=3) if dilation_factor > 1: - _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest')) + _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation="nearest")) - module = keras.Sequential(_layers) + module = Sequential(_layers) return module @@ -359,7 +396,6 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo x: List[tf.Tensor], **kwargs: Any, ) -> tf.Tensor: - # Channel mapping results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)] # Upsample & sum @@ -371,200 +407,324 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo return layers.concatenate(results) -class DBNet(_DBNet, keras.Model, NestedObject): +class DBNet(_DBNet, Model, NestedObject): """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_. Args: + ---- feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to + bin_thresh: threshold for binarization + box_thresh: minimal objectness score to consider a box + assume_straight_pages: if True, fit straight bounding boxes only + exportable: onnx exportable returns only logits + cfg: the configuration dict of the model + class_names: list of class names """ - _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "fpn", "probability_head", "threshold_head", "postprocessor"] def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, - rotated_bbox: bool = False, + fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea + bin_thresh: float = 0.3, + box_thresh: float = 0.1, + assume_straight_pages: bool = True, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, + class_names: List[str] = [CLASS_NAME], ) -> None: - super().__init__() + self.class_names = class_names + num_classes: int = len(self.class_names) self.cfg = cfg self.feat_extractor = feature_extractor - self.rotated_bbox = rotated_bbox + self.exportable = exportable + self.assume_straight_pages = assume_straight_pages self.fpn = FeaturePyramidNetwork(channels=fpn_channels) # Initialize kernels _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape] output_shape = tuple(self.fpn(_inputs).shape) - self.probability_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] + self.probability_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + self.threshold_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + + self.postprocessor = DBPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh ) - self.threshold_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] - ) - - self.postprocessor = 
DBPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, thresh_map: tf.Tensor, - target: List[Dict[str, Any]] + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output Args: + ---- out_map: output feature map of the model of shape (N, H, W, C) thresh_map: threshold map of shape (N, H, W, C) target: list of dictionary where each dict has a `boxes` and a `flags` entry + gamma: modulating factor in the focal loss formula + alpha: balancing factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") - prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1])) - thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) + prob_map = tf.math.sigmoid(out_map) + thresh_map = tf.math.sigmoid(thresh_map) - seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + seg_target, seg_mask, thresh_target, thresh_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) + seg_mask = tf.cast(seg_mask, tf.float32) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) - # Compute balanced BCE loss for proba_map - bce_scale = 5. - bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask] - - neg_target = 1 - seg_target[seg_mask] - positive_count = tf.math.reduce_sum(seg_target[seg_mask]) - negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count]) - negative_loss = bce_loss * neg_target - negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32)) - sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss) - balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6) - - # Compute dice loss for approxbin_map - bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask]))) - - bce_min = tf.math.reduce_min(bce_loss) - weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1. 
- inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights) - union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8 - dice_loss = 1 - 2.0 * inter / union + # Focal loss + focal_scale = 10.0 + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + + # Convert logits to prob, compute gamma factor + p_t = (seg_target * prob_map) + ((1 - seg_target) * (1 - prob_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class or for approx binary_map + if len(self.class_names) > 1: + dice_map = tf.nn.softmax(out_map, axis=-1) + else: + # compute binary map instead + dice_map = 1.0 / (1.0 + tf.exp(-50 * (prob_map - thresh_map))) + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) # Compute l1 loss for thresh_map - l1_scale = 10. if tf.reduce_any(thresh_mask): - l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask])) + thresh_mask = tf.cast(thresh_mask, tf.float32) + l1_loss = tf.reduce_sum(tf.abs(thresh_map - thresh_target) * thresh_mask) / ( + tf.reduce_sum(thresh_mask) + eps + ) else: - l1_loss = tf.constant(0.) + l1_loss = tf.constant(0.0) - return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss + return l1_loss + focal_scale * focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) feat_concat = self.fpn(feat_maps, **kwargs) logits = self.probability_head(feat_concat, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: - # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + if target is None or return_preds: + # Post-process boxes (keep only text predictions) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: thresh_map = self.threshold_head(feat_concat, **kwargs) loss = self.compute_loss(logits, thresh_map, target) - out['loss'] = loss + out["loss"] = loss return out -def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: +def _db_resnet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['fpn_channels'] = 
kwargs.get('fpn_channels', _cfg['fpn_channels']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) # Feature extractor - resnet = tf.keras.applications.__dict__[_cfg['backbone']]( - include_top=False, - weights=None, - input_shape=_cfg['input_shape'], - pooling=None, + feat_extractor = IntermediateLayerGetter( + backbone_fn( + weights="imagenet" if pretrained_backbone else None, + include_top=False, + pooling=None, + input_shape=_cfg["input_shape"], + ), + fpn_layers, ) + # Build the model + model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + + # Load pretrained parameters + if pretrained: + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) + + return model + + +def _db_mobilenet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained + + # Patch the config + _cfg = deepcopy(default_cfgs[arch]) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = default_cfgs[arch].get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor feat_extractor = IntermediateLayerGetter( - resnet, - _cfg['fpn_layers'], + backbone_fn( + input_shape=_cfg["input_shape"], + include_top=False, + pretrained=pretrained_backbone, + ), + fpn_layers, ) - kwargs['fpn_channels'] = _cfg['fpn_channels'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] - # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model
-[docs] +[docs] def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture Returns: + ------- text detection architecture """ + return _db_resnet( + "db_resnet50", + pretrained, + ResNet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
+ + + +
+[docs] +def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: + """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" + <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. + + >>> import tensorflow as tf + >>> from doctr.models import db_mobilenet_v3_large + >>> model = db_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) - return _db_resnet('db_resnet50', pretrained, **kwargs)
+ Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture + + Returns: + ------- + text detection architecture + """ + return _db_mobilenet( + "db_mobilenet_v3_large", + pretrained, + mobilenet_v3_large, + ["inverted_2", "inverted_5", "inverted_11", "final_block"], + **kwargs, + )
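A short sketch of the updated call signature introduced here, where return_preds replaces the former return_boxes and predictions are keyed by class name (random input, untrained weights):

    import tensorflow as tf
    from doctr.models import db_mobilenet_v3_large

    model = db_mobilenet_v3_large(pretrained=False)
    input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    out = model(input_tensor, return_model_output=True, return_preds=True)
    prob_map = out["out_map"]  # sigmoid probability map
    preds = out["preds"][0]    # dict mapping each class name to its predictions for the first page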

@@ -598,8 +758,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo - - + + diff --git a/v0.5.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.5.0/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.5.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/models/detection/linknet.html b/v0.5.0/_modules/doctr/models/detection/linknet.html deleted file mode 100644 index 129cfdce8b..0000000000 --- a/v0.5.0/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.linknet - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize p_map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from the LinkNet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box (relative coordinates)
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
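For intuition, the expected spatial-size flow through the encoder/decoder above can be written out explicitly; this is a sketch assuming the stem downsamples by 4 and each ResnetStage / decoder block changes the resolution by a factor of 2 (factors read off the strides above, not values exported by docTR).

# Spatial sizes for a 512 x 512 input (illustrative arithmetic only)
size = 512
stem_out = size // 4                                        # 128 after the stem
encoder_sizes = [stem_out // 2 ** i for i in (1, 2, 3, 4)]  # [64, 32, 16, 8]
decoder_sizes = [s * 2 for s in reversed(encoder_sizes)]    # [16, 32, 64, 128]
assert decoder_sizes[-1] == stem_out                        # decoder_1 restores the stem resolution
print(encoder_sizes, decoder_sizes)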
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
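To illustrate the target construction above on toy values, here is a standalone sketch (the grid size and box coordinates are made up, and the ambiguous/small-box masking is left out for brevity).

import numpy as np

output_shape = (1, 8, 8)                          # N x H x W
rel_boxes = np.array([[0.25, 0.25, 0.75, 0.75]])  # relative xmin, ymin, xmax, ymax
seg_target = np.zeros(output_shape, dtype=bool)

abs_boxes = (rel_boxes * [8, 8, 8, 8]).round().astype(int)
for xmin, ymin, xmax, ymax in abs_boxes:
    seg_target[0, ymin: ymax + 1, xmin: xmax + 1] = True
print(seg_target[0].astype(int))                  # filled rectangle where the box lies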
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionaries where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
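The masked BCE reduction above boils down to the following toy computation; this is a standalone re-creation with made-up values, not the exact docTR call path.

import tensorflow as tf

seg_target = tf.constant([[1., 0.], [1., 1.]])          # ground-truth map
out_map = tf.constant([[2.3, -1.1], [0.4, -0.7]])       # raw logits from the model
seg_mask = tf.constant([[True, True], [True, False]])   # False = pixel ignored in the loss

loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(
    tf.boolean_mask(seg_target, seg_mask),
    tf.boolean_mask(out_map, seg_mask),
    from_logits=True,
))
print(float(loss))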
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
\ No newline at end of file
diff --git a/v0.5.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.5.0/_modules/doctr/models/detection/linknet/tensorflow.html
index cd4f446673..ce995f99d4 100644
--- a/v0.5.0/_modules/doctr/models/detection/linknet/tensorflow.html
+++ b/v0.5.0/_modules/doctr/models/detection/linknet/tensorflow.html
@@ -13,7 +13,7 @@
-    
+    
doctr.models.detection.linknet.tensorflow - docTR documentation
@@ -225,20 +225,42 @@

Source code for doctr.models.detection.linknet.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.classification import resnet18, resnet34, resnet50
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.backbones import ResnetStage
-from doctr.models.utils import conv_sequence, load_pretrained_params
-from .base import LinkNetPostProcessor, _LinkNet
 
-__all__ = ['LinkNet', 'linknet16']
+from .base import LinkNetPostProcessor, _LinkNet
 
+__all__ = ["LinkNet", "linknet_resnet18", "linknet_resnet34", "linknet_resnet50"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet16': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'num_classes': 1,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': None,
+    "linknet_resnet18": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet18-615a82c5.weights.h5&src=0",
+    },
+    "linknet_resnet34": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet34-9d772be5.weights.h5&src=0",
+    },
+    "linknet_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet50-6bf6c8b5.weights.h5&src=0",
     },
 }
 
 
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
+def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential:
     """Creates a LinkNet decoder block"""
-
     return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
+        *conv_sequence(in_chan // 4, "relu", True, kernel_size=1, **kwargs),
         layers.Conv2DTranspose(
             filters=in_chan // 4,
             kernel_size=3,
-            strides=2,
+            strides=stride,
             padding="same",
             use_bias=False,
-            kernel_initializer='he_normal'
+            kernel_initializer="he_normal",
         ),
         layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
+        layers.Activation("relu"),
+        *conv_sequence(out_chan, "relu", True, kernel_size=1),
     ])
 
 
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module"""
+class LinkNetFPN(Model, NestedObject):
+    """LinkNet Decoder module"""
 
     def __init__(
         self,
+        out_chans: int,
+        in_shapes: List[Tuple[int, ...]],
     ) -> None:
-
         super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
+        self.out_chans = out_chans
+        strides = [2] * (len(in_shapes) - 1) + [1]
+        i_chans = [s[-1] for s in in_shapes[::-1]]
+        o_chans = i_chans[1:] + [out_chans]
+        self.decoders = [
+            decoder_block(in_chan, out_chan, s, input_shape=in_shape)
+            for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1])
+        ]
+
+    def call(self, x: List[tf.Tensor], **kwargs: Any) -> tf.Tensor:
+        out = 0
+        for decoder, fmap in zip(self.decoders, x[::-1]):
+            out = decoder(out + fmap, **kwargs)
+        return out
 
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(_LinkNet, keras.Model):
+    def extra_repr(self) -> str:
+        return f"out_chans={self.out_chans}"
+
+
+class LinkNet(_LinkNet, Model):
     """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
     <https://arxiv.org/pdf/1707.03718.pdf>`_.
 
     Args:
-        num_classes: number of channels for the output
+    ----
+        feature extractor: the backbone serving as feature extractor
+        fpn_channels: number of channels each extracted feature maps is mapped to
+        bin_thresh: threshold for binarization of the output feature map
+        box_thresh: minimal objectness score to consider a box
+        assume_straight_pages: if True, fit straight bounding boxes only
+        exportable: onnx exportable returns only logits
+        cfg: the configuration dict of the model
+        class_names: list of class names
     """
 
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
+    _children_names: List[str] = ["feat_extractor", "fpn", "classifier", "postprocessor"]
 
     def __init__(
         self,
-        num_classes: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        rotated_bbox: bool = False,
+        feat_extractor: IntermediateLayerGetter,
+        fpn_channels: int = 64,
+        bin_thresh: float = 0.1,
+        box_thresh: float = 0.1,
+        assume_straight_pages: bool = True,
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
+        class_names: List[str] = [CLASS_NAME],
     ) -> None:
         super().__init__(cfg=cfg)
 
-        self.rotated_bbox = rotated_bbox
+        self.class_names = class_names
+        num_classes: int = len(self.class_names)
 
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
+        self.exportable = exportable
+        self.assume_straight_pages = assume_straight_pages
+
+        self.feat_extractor = feat_extractor
 
-        self.fpn = LinkNetFPN()
+        self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape])
+        self.fpn.build(self.feat_extractor.output_shape)
 
         self.classifier = Sequential([
             layers.Conv2DTranspose(
@@ -393,154 +442,246 @@ 

Source code for doctr.models.detection.linknet.tensorflow

strides=2, padding="same", use_bias=False, - kernel_initializer='he_normal' + kernel_initializer="he_normal", + input_shape=self.fpn.decoders[-1].output_shape[1:], ), layers.BatchNormalization(), - layers.Activation('relu'), - *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), + layers.Activation("relu"), + *conv_sequence(32, "relu", True, kernel_size=3, strides=1), layers.Conv2DTranspose( filters=num_classes, kernel_size=2, strides=2, padding="same", - use_bias=False, - kernel_initializer='he_normal' + use_bias=True, + kernel_initializer="he_normal", ), ]) - self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) + self.postprocessor = LinkNetPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh + ) def compute_loss( self, out_map: tf.Tensor, - target: List[Dict[str, Any]], - focal_loss: bool = False, - alpha: float = .5, - gamma: float = 2., - edge_factor: float = 2., + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on <https://github.com/tensorflow/addons/>`_. Args: + ---- out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry - focal_loss: if True, use focal loss instead of BCE - edge_factor: boost factor for box edges (in case of BCE) + gamma: modulating factor in the focal loss formula alpha: balancing factor in the focal loss formula - gammma: modulating factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ - seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) - edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) + seg_target, seg_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - - # Get the cross_entropy for each entry - bce = tf.keras.losses.binary_crossentropy( - seg_target[seg_mask], - tf.squeeze(out_map, axis=[-1])[seg_mask], - from_logits=True) - - if focal_loss: - if gamma and gamma < 0: - raise ValueError("Value of gamma should be greater than or equal to zero.") - - # Convert logits to prob, compute gamma factor - pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) - p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) - modulating_factor = tf.pow((1.0 - p_t), gamma) - - # Compute alpha factor - alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) - - # compute the final loss - loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) - - else: - # Compute BCE loss with highlighted edges - loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), - bce - ) - loss = tf.reduce_mean(loss) - - return loss + seg_mask = tf.cast(seg_mask, tf.float32) + + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + proba_map = tf.sigmoid(out_map) + + # Focal loss + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + # Convert logits to prob, compute gamma factor + p_t = (seg_target * proba_map) + ((1 - seg_target) * (1 - proba_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * 
(1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class + dice_map = tf.nn.softmax(out_map, axis=-1) if len(self.class_names) > 1 else proba_map + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) + + return focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, - focal_loss: bool = True, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - - logits = self.stem(x) - logits = self.fpn(logits) - logits = self.classifier(logits) + feat_maps = self.feat_extractor(x, **kwargs) + logits = self.fpn(feat_maps, **kwargs) + logits = self.classifier(logits, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) + if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: + if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: - loss = self.compute_loss(logits, target, focal_loss) - out['loss'] = loss + loss = self.compute_loss(logits, target) + out["loss"] = loss return out -def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: +def _linknet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> LinkNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor + feat_extractor = IntermediateLayerGetter( + backbone_fn( + pretrained=pretrained_backbone, + include_top=False, + input_shape=_cfg["input_shape"], + ), + fpn_layers, + ) - kwargs['num_classes'] = _cfg['num_classes'] - kwargs['input_shape'] = _cfg['input_shape'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(cfg=_cfg, **kwargs) + model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + 
skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model -
-[docs] -def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
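The focal and dice terms combined in the new `compute_loss` above can be summarised on toy tensors. The sketch below is a simplified single-class version (no mask, made-up values); `alpha`, `gamma` and `eps` follow the defaults of the signature above, while the function name is just illustrative.

import tensorflow as tf

def focal_plus_dice(logits, target, alpha=0.5, gamma=2.0, eps=1e-8):
    # per-pixel BCE, then focal modulation: alpha_t * (1 - p_t) ** gamma * bce
    bce = tf.keras.losses.binary_crossentropy(target[..., None], logits[..., None], from_logits=True)
    proba = tf.sigmoid(logits)
    p_t = target * proba + (1 - target) * (1 - proba)
    alpha_t = target * alpha + (1 - target) * (1 - alpha)
    focal = tf.reduce_mean(alpha_t * (1 - p_t) ** gamma * bce)
    # dice term: 1 - 2 * intersection / cardinality
    inter = tf.reduce_sum(proba * target)
    cardinality = tf.reduce_sum(proba + target)
    dice = 1 - 2 * inter / (cardinality + eps)
    return focal + dice

logits = tf.constant([[3.0, -2.0], [0.1, -0.1]])
target = tf.constant([[1.0, 0.0], [1.0, 0.0]])
print(float(focal_plus_dice(logits, target)))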
+[docs] +def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet18 + >>> model = linknet_resnet18(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture + + Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet18", + pretrained, + resnet18, + ["resnet_block_1", "resnet_block_3", "resnet_block_5", "resnet_block_7"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet34(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet16 - >>> model = linknet16(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet34 + >>> model = linknet_resnet34(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture Returns: + ------- text detection architecture """ + return _linknet( + "linknet_resnet34", + pretrained, + resnet34, + ["resnet_block_2", "resnet_block_6", "resnet_block_12", "resnet_block_15"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet50(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet50 + >>> model = linknet_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture - return _linknet('linknet16', pretrained, **kwargs)
+ Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet50", + pretrained, + resnet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
@@ -574,8 +715,8 @@

Source code for doctr.models.detection.linknet.tensorflow

- +
+    
diff --git a/v0.5.0/_modules/doctr/models/detection/zoo.html b/v0.5.0/_modules/doctr/models/detection/zoo.html
index d3128b8d14..3651c4e2d3 100644
--- a/v0.5.0/_modules/doctr/models/detection/zoo.html
+++ b/v0.5.0/_modules/doctr/models/detection/zoo.html
@@ -13,7 +13,7 @@
-    
+    
doctr.models.detection.zoo - docTR documentation
@@ -225,20 +225,42 @@

Source code for doctr.models.detection.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
 from doctr.file_utils import is_tf_available, is_torch_available
-from .core import DetectionPredictor
-from ..preprocessor import PreProcessor
-from .. import detection
 
+from .. import detection
+from ..detection.fast import reparameterize
+from ..preprocessor import PreProcessor
+from .predictor import DetectionPredictor
 
 __all__ = ["detection_predictor"]
 
+ARCHS: List[str]
+
 
 if is_tf_available():
-    ARCHS = ['db_resnet50', 'linknet16']
+    ARCHS = [
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
 elif is_torch_available():
-    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
+    ARCHS = [
+        "db_resnet34",
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
+
 
+def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
+        _model = detection.__dict__[arch](
+            pretrained=pretrained,
+            pretrained_backbone=kwargs.get("pretrained_backbone", True),
+            assume_straight_pages=assume_straight_pages,
+        )
+        # Reparameterize FAST models by default to lower inference latency and memory usage
+        if isinstance(_model, detection.FAST):
+            _model = reparameterize(_model)
+    else:
+        if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
+            raise ValueError(f"unknown architecture: {type(arch)}")
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+        _model = arch
+        _model.assume_straight_pages = assume_straight_pages
+        _model.postprocessor.assume_straight_pages = assume_straight_pages
 
-    # Detection
-    _model = detection.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 1)
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 2)
     predictor = DetectionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
-        _model
+        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
+        _model,
     )
     return predictor
 
 
 
-[docs] -def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: +[docs] +def detection_predictor( + arch: Any = "fast_base", + pretrained: bool = False, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + batch_size: int = 2, + **kwargs: Any, +) -> DetectionPredictor: """Text detection architecture. - Example:: - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_resnet50') + ---- + arch: name of the architecture or model itself to use (e.g. 'db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset + assume_straight_pages: If True, fit straight boxes to the page + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right + batch_size: number of samples the model processes in parallel + **kwargs: optional keyword arguments passed to the architecture Returns: + ------- Detection predictor """ - - return _predictor(arch, pretrained, **kwargs)
+ return _predictor( + arch=arch, + pretrained=pretrained, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + batch_size=batch_size, + **kwargs, + )
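Since `arch` now also accepts an instantiated model (see the `isinstance` branch in `_predictor` above), the predictor can be built around a detection model you have already created. A short sketch based on the updated signature (using `pretrained=True` assumes the weights can be downloaded):

import numpy as np
from doctr.models import detection_predictor, linknet_resnet18

model = linknet_resnet18(pretrained=True)
predictor = detection_predictor(arch=model, assume_straight_pages=True, batch_size=2)
page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
out = predictor([page])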
@@ -367,8 +449,8 @@

Source code for doctr.models.detection.zoo

       
     
   
-    
-    
+    
+    
diff --git a/v0.5.0/_modules/doctr/models/export.html b/v0.5.0/_modules/doctr/models/export.html
deleted file mode 100644
index f25a81aa21..0000000000
--- a/v0.5.0/_modules/doctr/models/export.html
+++ /dev/null
@@ -1,411 +0,0 @@
-doctr.models.export - docTR documentation

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- -
\ No newline at end of file
diff --git a/v0.5.0/_modules/doctr/models/factory/hub.html b/v0.5.0/_modules/doctr/models/factory/hub.html
index 8274a809f5..756b2c7a17 100644
--- a/v0.5.0/_modules/doctr/models/factory/hub.html
+++ b/v0.5.0/_modules/doctr/models/factory/hub.html
@@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
-    
+    
diff --git a/v0.5.0/_modules/doctr/models/recognition/crnn.html b/v0.5.0/_modules/doctr/models/recognition/crnn.html
deleted file mode 100644
index daa2393439..0000000000
--- a/v0.5.0/_modules/doctr/models/recognition/crnn.html
+++ /dev/null
@@ -1,565 +0,0 @@
-doctr.models.recognition.crnn - docTR documentation

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs CTC decoding of the raw model output, then maps the predicted indices
-        to characters with the label_to_idx dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
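For readers unfamiliar with the CTC decoding step described above, here is a toy standalone sketch; the two-character vocabulary and the random logits are made up, and the beam width / path count mirror the defaults used above.

import tensorflow as tf

vocab = "ab"
logits = tf.random.uniform((2, 10, len(vocab) + 1))  # B x SEQ_LEN x (num_classes + 1)
decoded, log_prob = tf.nn.ctc_beam_search_decoder(
    tf.transpose(logits, perm=[1, 0, 2]),
    tf.fill([logits.shape[0]], logits.shape[1]),
    beam_width=1, top_paths=1,
)
out_idxs = tf.sparse.to_dense(decoded[0], default_value=len(vocab)).numpy()
probs = tf.math.exp(tf.squeeze(log_prob, axis=1)).numpy()
# index len(vocab) acts as padding here: drop it when mapping back to characters
words = ["".join(vocab[i] for i in row if i < len(vocab)) for row in out_idxs]
print(list(zip(words, probs.tolist())))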
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            model_output: predicted logits of the model
-            target: list of ground-truth words for the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
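The transpose/reshape in `call` above turns the 2D feature map into a width-wise sequence for the BiLSTM decoder; a shape-only sketch (the backbone output shape below is illustrative):

import tensorflow as tf

feat = tf.zeros((1, 4, 32, 512))                 # N x H x W x C from the backbone
seq = tf.reshape(tf.transpose(feat, perm=[0, 2, 1, 3]), (-1, 32, 4 * 512))
print(seq.shape)                                 # (1, 32, 2048): 32 time steps fed to the BiLSTMs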
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
\ No newline at end of file
diff --git a/v0.5.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/crnn/tensorflow.html
index 41cc93dd23..bc64da9a1b 100644
--- a/v0.5.0/_modules/doctr/models/recognition/crnn/tensorflow.html
+++ b/v0.5.0/_modules/doctr/models/recognition/crnn/tensorflow.html
@@ -13,7 +13,7 @@
-    
+    
doctr.models.recognition.crnn.tensorflow - docTR documentation
@@ -225,20 +225,42 @@

Source code for doctr.models.recognition.crnn.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import tensorflow as tf
 from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential, Model
-from typing import Tuple, Dict, Any, Optional, List
+from tensorflow.keras.models import Model, Sequential
+
+from doctr.datasets import VOCABS
 
-from ... import backbones
-from ...utils import load_pretrained_params
+from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
+__all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
+    "crnn_vgg16_bn": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["legacy_french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_vgg16_bn-9c188f45.weights.h5&src=0",
     },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
+    "crnn_mobilenet_v3_small": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_small-54850265.weights.h5&src=0",
+    },
+    "crnn_mobilenet_v3_large": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_large-c64045e5.weights.h5&src=0",
     },
 }
 
 
 class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
+    """Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
         ignore_case: if True, ignore case of letters
         ignore_accents: if True, ignore accents of letters
@@ -325,37 +353,57 @@ 

Source code for doctr.models.recognition.crnn.tensorflow

def __call__( self, - logits: tf.Tensor - ) -> List[Tuple[str, float]]: - """ - Performs decoding of raw output with CTC and decoding of CTC predictions + logits: tf.Tensor, + beam_width: int = 1, + top_paths: int = 1, + ) -> Union[List[Tuple[str, float]], List[Tuple[List[str], List[float]]]]: + """Performs decoding of raw output with CTC and decoding of CTC predictions with label_to_idx mapping dictionnary Args: + ---- logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1 + beam_width: An int scalar >= 0 (beam search beam width). + top_paths: An int scalar >= 0, <= beam_width (controls output size). Returns: + ------- A list of decoded words of length BATCH_SIZE + """ # Decode CTC _decoded, _log_prob = tf.nn.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), - tf.fill(logits.shape[0], logits.shape[1]), - beam_width=1, top_paths=1, + tf.fill(tf.shape(logits)[:1], tf.shape(logits)[1]), + beam_width=beam_width, + top_paths=top_paths, ) - out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab)) - probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) + + _decoded = tf.sparse.concat( + 1, + [tf.sparse.expand_dims(dec, axis=1) for dec in _decoded], + expand_nonconcat_dims=True, + ) # dim : batchsize x beamwidth x actual_max_len_predictions + out_idxs = tf.sparse.to_dense(_decoded, default_value=len(self.vocab)) # Map it to characters _decoded_strings_pred = tf.strings.reduce_join( inputs=tf.nn.embedding_lookup(tf.constant(self._embedding, dtype=tf.string), out_idxs), - axis=-1 + axis=-1, ) _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] - word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - + decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value="not valid")[ + :, :, 0 + ] # dim : batch_size x beam_width + + if top_paths == 1: + probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) # dim : batchsize + decoded_strings_pred = tf.squeeze(decoded_strings_pred, axis=1) + word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] + else: + probs = tf.math.exp(_log_prob) # dim : batchsize x beamwidth + word_values = [[word.decode() for word in words] for words in decoded_strings_pred.numpy().tolist()] return list(zip(word_values, probs.numpy().tolist())) @@ -364,19 +412,26 @@

Source code for doctr.models.recognition.crnn.tensorflow

Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of units in the LSTM layers + exportable: onnx exportable returns only logits + beam_width: beam width for beam search decoding + top_paths: number of top paths for beam search decoding cfg: configuration dictionary """ - _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "decoder", "postprocessor"] def __init__( self, - feature_extractor: tf.keras.Model, + feature_extractor: Model, vocab: str, rnn_units: int = 128, + exportable: bool = False, + beam_width: int = 1, + top_paths: int = 1, cfg: Optional[Dict[str, Any]] = None, ) -> None: # Initialize kernels @@ -386,19 +441,21 @@

Source code for doctr.models.recognition.crnn.tensorflow

self.vocab = vocab self.max_length = w self.cfg = cfg + self.exportable = exportable self.feat_extractor = feature_extractor - self.decoder = Sequential( - [ - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Dense(units=len(vocab) + 1) - ] - ) + self.decoder = Sequential([ + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Dense(units=len(vocab) + 1), + ]) self.decoder.build(input_shape=(None, w, h * c)) self.postprocessor = CTCPostProcessor(vocab=vocab) + self.beam_width = beam_width + self.top_paths = top_paths + def compute_loss( self, model_output: tf.Tensor, @@ -407,16 +464,17 @@

Source code for doctr.models.recognition.crnn.tensorflow

"""Compute CTC loss for the model. Args: - gt: the encoded tensor with gt labels + ---- model_output: predicted logits of the model - seq_len: lengths of each gt word inside the batch + target: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) batch_len = model_output.shape[0] - input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) + input_length = tf.fill((batch_len,), model_output.shape[1]) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -428,8 +486,12 @@

Source code for doctr.models.recognition.crnn.tensorflow

target: Optional[List[str]] = None, return_model_output: bool = False, return_preds: bool = False, + beam_width: int = 1, + top_paths: int = 1, **kwargs: Any, ) -> Dict[str, Any]: + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") features = self.feat_extractor(x, **kwargs) # B x H x W x C --> B x W x H x C @@ -437,91 +499,132 @@

Source code for doctr.models.recognition.crnn.tensorflow

w, h, c = transposed_feat.get_shape().as_list()[1:] # B x W x H x C --> B x W x H * C features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c)) - logits = self.decoder(features_seq, **kwargs) + logits = _bf16_to_float32(self.decoder(features_seq, **kwargs)) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = logits + return out + if return_model_output: out["out_map"] = logits if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(logits) + out["preds"] = self.postprocessor(logits, beam_width=beam_width, top_paths=top_paths) if target is not None: - out['loss'] = self.compute_loss(logits, target) + out["loss"] = self.compute_loss(logits, target) return out -def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: +def _crnn( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> CRNN: + pretrained_backbone = pretrained_backbone and not pretrained + + kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"]) - # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) + _cfg["vocab"] = kwargs["vocab"] + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] - # Feature extractor - feat_extractor = backbones.__dict__[_cfg['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + input_shape=_cfg["input_shape"], include_top=False, + pretrained=pretrained_backbone, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params(model, _cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]) return model
-[docs] +[docs] def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_vgg16_bn + >>> model = crnn_vgg16_bn(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_vgg16_bn", pretrained, vgg16_bn_r, **kwargs)
+ + + +
+[docs] +def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based + Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_small + >>> model = crnn_mobilenet_v3_small(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
+ Returns: + ------- + text recognition architecture + """ + return _crnn("crnn_mobilenet_v3_small", pretrained, mobilenet_v3_small_r, **kwargs)
-def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based +
+[docs] +def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_large + >>> model = crnn_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_mobilenet_v3_large", pretrained, mobilenet_v3_large_r, **kwargs)
- return _crnn('crnn_resnet31', pretrained, **kwargs)
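One practical consequence of the skip_mismatch logic in the refactored _crnn factory above: a pretrained checkpoint can still be loaded when training on a different character set, with only the vocabulary-dependent output layer re-initialised. A hypothetical fine-tuning call (the digits-only vocab is just an example):

from doctr.models import crnn_vgg16_bn

custom_vocab = "0123456789"  # illustrative custom character set

# Weights are loaded wherever shapes match; the final Dense layer, whose size
# depends on the vocabulary, is skipped and trained from scratch.
model = crnn_vgg16_bn(pretrained=True, vocab=custom_vocab)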
@@ -554,8 +657,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

- +
+ diff --git a/v0.5.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/master/tensorflow.html index 2dc5a27717..aa6aa69325 100644 --- a/v0.5.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.master.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import tensorflow as tf
-from tensorflow.keras import layers, Sequential, Model
-from typing import Tuple, List, Dict, Any, Optional
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
 
-from ..core import RecognitionPostProcessor
-from ...backbones.resnet import ResnetStage
-from ...utils import conv_sequence, load_pretrained_params
-from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
-from ....datasets import VOCABS
-from .base import _MASTER, _MASTERPostProcessor
+import tensorflow as tf
+from tensorflow.keras import Model, layers
+
+from doctr.datasets import VOCABS
+from doctr.models.classification import magc_resnet31
+from doctr.models.modules.transformer import Decoder, PositionalEncoding
 
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from .base import _MASTER, _MASTERPostProcessor
 
-__all__ = ['MASTER', 'master', 'MASTERPostProcessor']
+__all__ = ["MASTER", "master"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'master': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'input_shape': (48, 160, 3),
-        'vocab': VOCABS['french'],
-        'url': None,
+    "master": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
     },
 }
 
 
-class MAGC(layers.Layer):
-
-    """Implements the Multi-Aspect Global Context Attention, as described in
-    <https://arxiv.org/pdf/1910.02562.pdf>`_.
-
-    Args:
-        inplanes: input channels
-        headers: number of headers to split channels
-        att_scale: if True, re-scale attention to counteract the variance distributions
-        **kwargs
-    """
-
-    def __init__(
-        self,
-        inplanes: int,
-        headers: int = 1,
-        att_scale: bool = False,
-        **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-
-        self.headers = headers  # h
-        self.inplanes = inplanes  # C
-        self.att_scale = att_scale
-
-        self.single_header_inplanes = int(inplanes / headers)  # C / h
-
-        self.conv_mask = tf.keras.layers.Conv2D(
-            filters=1,
-            kernel_size=1,
-            kernel_initializer=tf.initializers.he_normal()
-        )
-
-        self.transform = tf.keras.Sequential(
-            [
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-                tf.keras.layers.LayerNormalization([1, 2, 3]),
-                tf.keras.layers.ReLU(),
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-            ],
-            name='transform'
-        )
-
-    @tf.function
-    def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
-        b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
-
-        # B, H, W, C -->> B*h, H, W, C/h
-        x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes))
-        x = tf.transpose(x, perm=(0, 3, 1, 2, 4))
-        x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes))
-
-        # Compute shortcut
-        shortcut = x
-        # B*h, 1, H*W, C/h
-        shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes))
-        # B*h, 1, C/h, H*W
-        shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2])
-
-        # Compute context mask
-        # B*h, H, W, 1,
-        context_mask = self.conv_mask(x)
-        # B*h, 1, H*W, 1
-        context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
-        # scale variance
-        if self.att_scale and self.headers > 1:
-            context_mask = context_mask / tf.sqrt(self.single_header_inplanes)
-        # B*h, 1, H*W, 1
-        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
-
-        # Compute context
-        # B*h, 1, C/h, 1
-        context = tf.matmul(shortcut, context_mask)
-        context = tf.reshape(context, shape=(b, 1, c, 1))
-        # B, 1, 1, C
-        context = tf.transpose(context, perm=(0, 1, 3, 2))
-        # Set shape to resolve shape when calling this module in the Sequential MAGCResnet
-        batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1]
-        context.set_shape([batch, 1, 1, chan])
-        return context
-
-    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
-        # Context modeling: B, H, W, C  ->  B, 1, 1, C
-        context = self.context_modeling(inputs)
-        # Transform: B, 1, 1, C  ->  B, 1, 1, C
-        transformed = self.transform(context)
-        return inputs + transformed
-
-
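Stripped of the multi-header bookkeeping, the context_modeling step of the MAGC block removed above is a global-context attention: a 1x1 convolution scores every spatial position, a softmax over H*W turns the scores into weights, and the weighted sum of the features yields one context vector per image, which is then transformed and added back to the input. A single-header sketch with illustrative shapes (the transform branch is omitted for brevity):

import tensorflow as tf

x = tf.random.normal((2, 6, 20, 256))                  # hypothetical feature map (B, H, W, C)
b, h, w, c = x.shape

conv_mask = tf.keras.layers.Conv2D(filters=1, kernel_size=1)
scores = tf.reshape(conv_mask(x), (b, h * w, 1))       # one score per spatial position
weights = tf.nn.softmax(scores, axis=1)                # attention over the H*W positions

values = tf.reshape(x, (b, h * w, c))                  # (B, H*W, C)
context = tf.reduce_sum(values * weights, axis=1)      # (B, C) global context vector
context = tf.reshape(context, (b, 1, 1, c))            # broadcastable back onto the map

out = x + context                                      # residual add (transform omitted)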
-class MAGCResnet(Sequential):
-
-    """Implements the modified resnet with MAGC layers, as described in paper.
-
-    Args:
-        headers: number of headers to split channels in MAGC layers
-        input_shape: shape of the model input (without batch dim)
-    """
-
-    def __init__(
-        self,
-        headers: int = 1,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
-    ) -> None:
-        _layers = [
-            # conv_1x
-            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
-            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_2x
-            ResnetStage(num_blocks=1, output_channels=256),
-            MAGC(inplanes=256, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_3x
-            ResnetStage(num_blocks=2, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 1), (2, 1)),
-            # conv_4x
-            ResnetStage(num_blocks=5, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            # conv_5x
-            ResnetStage(num_blocks=3, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-        ]
-        super().__init__(_layers)
-
-
 class MASTER(_MASTER, Model):
-
     """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
     Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
 
     Args:
+    ----
+        feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary, (without EOS, SOS, PAD)
         d_model: d parameter for the transformer decoder
-        headers: headers for the MAGC module
         dff: depth of the pointwise feed-forward layer
         num_heads: number of heads for the multi-head attention module
         num_layers: number of decoder layers to stack
         max_length: maximum length of character sequence handled by the model
-        input_size: size of the image inputs
+        dropout: dropout probability of the decoder
+        input_shape: size of the image inputs
+        exportable: if True, the model returns only the logits (for ONNX export)
+        cfg: dictionary containing information about the model
     """
 
     def __init__(
         self,
+        feature_extractor: Model,
         vocab: str,
         d_model: int = 512,
-        headers: int = 1,
         dff: int = 2048,
-        num_heads: int = 8,
+        num_heads: int = 8,  # number of heads in the transformer decoder
         num_layers: int = 3,
         max_length: int = 50,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
+        dropout: float = 0.2,
+        input_shape: Tuple[int, int, int] = (32, 128, 3),  # different from the paper
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
 
-        self.vocab = vocab
+        self.exportable = exportable
         self.max_length = max_length
+        self.d_model = d_model
+        self.vocab = vocab
         self.cfg = cfg
         self.vocab_size = len(vocab)
 
-        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
-        self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS
+        self.feat_extractor = feature_extractor
+        self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])
 
         self.decoder = Decoder(
             num_layers=num_layers,
-            d_model=d_model,
+            d_model=self.d_model,
             num_heads=num_heads,
+            vocab_size=self.vocab_size + 3,  # EOS, SOS, PAD
             dff=dff,
-            vocab_size=self.vocab_size,
-            maximum_position_encoding=max_length,
+            dropout=dropout,
+            maximum_position_encoding=self.max_length,
         )
-        self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
-        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
 
+        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
 
     @tf.function
-    def make_mask(self, target: tf.Tensor) -> tf.Tensor:
-        look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
-        target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
-        combined_mask = tf.maximum(target_padding_mask, look_ahead_mask)
-        return combined_mask
+    def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
+        # (N, 1, 1, max_length)
+        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
+        target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
+        target_length = target.shape[1]
+        # sub mask: lower triangular, 1 = visible and 0 = masked, shape (max_length, max_length)
+        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
+        # source mask filled with ones (max_length, positional_encoded_seq_len)
+        source_mask = tf.ones((target_length, source.shape[1]))
+        # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
+        target_mask = tf.math.logical_and(
+            tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
+        )
+        return source_mask, target_mask
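The mask construction above combines a causal (look-ahead) mask with a padding mask. A tiny standalone example with a made-up PAD index makes the shapes explicit:

import tensorflow as tf

PAD = 9  # hypothetical index of the padding symbol
# Two target sequences of length 5; the second one is padded after 3 tokens
target = tf.constant([[7, 1, 2, 3, 4],
                      [7, 1, 2, PAD, PAD]])

# Padding mask: True on real tokens, False on PAD -> shape (N, 1, 1, L)
pad_mask = tf.cast(tf.math.not_equal(target, PAD), tf.bool)[:, tf.newaxis, tf.newaxis, :]

# Causal mask: lower-triangular, so position t only attends to positions <= t -> (L, L)
length = target.shape[1]
causal_mask = tf.cast(tf.linalg.band_part(tf.ones((length, length)), -1, 0), tf.bool)

# Combined mask broadcasts to (N, 1, L, L); False entries are hidden from the decoder
combined = tf.math.logical_and(causal_mask, pad_mask)
print(combined.shape)  # (2, 1, 5, 5)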
 
+    @staticmethod
     def compute_loss(
-        self,
         model_output: tf.Tensor,
         gt: tf.Tensor,
         seq_len: List[int],
@@ -512,11 +413,13 @@ 

Source code for doctr.models.recognition.master.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -532,7 +435,7 @@

Source code for doctr.models.recognition.master.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) @@ -547,94 +450,103 @@
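The loss used here (and by SAR further down) is a cross-entropy that ignores every timestep after the end-of-sequence token. The masking can be reproduced in isolation with tf.sequence_mask; the numbers below are toy values, not library output:

import tensorflow as tf

# Per-timestep cross-entropy for 2 samples over 6 timesteps (toy values)
cce = tf.constant([[0.5, 0.4, 0.3, 0.9, 0.9, 0.9],
                   [0.2, 0.1, 0.8, 0.8, 0.8, 0.8]])
seq_len = tf.constant([3, 2])  # number of valid timesteps (word + EOS) per sample

mask_2d = tf.sequence_mask(seq_len, maxlen=cce.shape[1])   # True on valid timesteps only
masked = tf.where(mask_2d, cce, tf.zeros_like(cce))        # drop everything after EOS
ce_loss = tf.reduce_sum(masked, axis=1) / tf.cast(seq_len, cce.dtype)
print(ce_loss.numpy())  # [0.4, 0.15]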

Source code for doctr.models.recognition.master.tensorflow

"""Call function for training Args: + ---- x: images target: list of str labels return_model_output: if True, return logits return_preds: if True, decode logits + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A dictionnary containing eventually loss, logits and predictions. """ - # Encode - feature = self.feature_extractor(x, **kwargs) - b, h, w, c = (tf.shape(feature)[i] for i in range(4)) + feature = self.feat_extractor(x, **kwargs) + b, h, w, c = feature.get_shape() + # (N, H, W, C) --> (N, H * W, C) feature = tf.reshape(feature, shape=(b, h * w, c)) - encoded = feature + self.feature_pe[:, :h * w, :] + # add positional encoding to features + encoded = self.positional_encoding(feature, **kwargs) out: Dict[str, tf.Tensor] = {} + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") + if target is not None: # Compute target: tensor of gts and sequence lengths - gt, seq_len = self.compute_target(target) - - if kwargs.get('training', False): - if target is None: - raise AssertionError("In training mode, you need to pass a value to 'target'") - tgt_mask = self.make_mask(gt) + gt, seq_len = self.build_target(target) + # Compute decoder masks + source_mask, target_mask = self.make_source_and_target_mask(encoded, gt) # Compute logits - output = self.decoder(gt, encoded, tgt_mask, None, **kwargs) + output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) - else: - # When not training, we want to compute logits in with the decoder, although - # we have access to gts (we need gts to compute the loss, but not in the decoder) logits = self.decode(encoded, **kwargs) + logits = _bf16_to_float32(logits) + + if self.exportable: + out["logits"] = logits + return out + if target is not None: - out['loss'] = self.compute_loss(logits, gt, seq_len) + out["loss"] = self.compute_loss(logits, gt, seq_len) if return_model_output: - out['out_map'] = logits + out["out_map"] = logits if return_preds: - predictions = self.postprocessor(logits) - out['preds'] = predictions + out["preds"] = self.postprocessor(logits) return out + @tf.function def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor: """Decode function for prediction Args: + ---- encoded: encoded features + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A Tuple of tf.Tensor: predictions, logits """ - b = tf.shape(encoded)[0] - max_len = tf.constant(self.max_length, dtype=tf.int32) + b = encoded.shape[0] + start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32) # SOS padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32) # PAD - ys = tf.fill(dims=(b, max_len - 1), value=padding_symbol) + ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol) start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols - # max_len = len + 2 (sos + eos) + # Final dimension include EOS/SOS/PAD for i in range(self.max_length - 1): - ys_mask = self.make_mask(ys) - output = self.decoder(ys, encoded, ys_mask, None, **kwargs) + source_mask, target_mask = self.make_source_and_target_mask(encoded, ys) + output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) prob = tf.nn.softmax(logits, axis=-1) - next_word = tf.argmax(prob, axis=-1, output_type=ys.dtype) - # ys.shape = B, T - 
i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(max_len), indexing='ij') + next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype) + # update ys with the next token and ignore the first token (SOS) + i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij") indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1) - ys = tf.tensor_scatter_nd_update(ys, indices, next_word[:, i + 1]) + ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i]) - # final_logits of shape (N, max_length - 1, vocab_size + 1) (whithout sos) + # Shape (N, max_length, vocab_size + 1) return logits class MASTERPostProcessor(_MASTERPostProcessor): """Post processor for MASTER architectures + Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -649,51 +561,66 @@
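At inference time the decode loop above is a plain greedy search: it starts from a row holding SOS followed by PAD tokens and, one step at a time, writes the argmax of the current logits into the next slot. The control flow, detached from the transformer itself, looks roughly like this (step is a stand-in for decoder + linear head, and all symbol indices are hypothetical):

import tensorflow as tf

VOCAB_SIZE, SOS, PAD, MAX_LEN = 5, 6, 7, 4   # hypothetical sizes and symbol indices

def step(ys):
    # Stand-in for decoder + linear head: logits of shape (N, MAX_LEN, VOCAB_SIZE + 3)
    return tf.random.normal((ys.shape[0], MAX_LEN, VOCAB_SIZE + 3))

batch = 2
ys = tf.concat([tf.fill((batch, 1), SOS), tf.fill((batch, MAX_LEN - 1), PAD)], axis=-1)

for i in range(MAX_LEN - 1):
    logits = step(ys)
    next_token = tf.argmax(tf.nn.softmax(logits, axis=-1), axis=-1, output_type=ys.dtype)
    # write the prediction for position i into slot i + 1, keeping the SOS column untouched
    ys = tf.concat([ys[:, : i + 1], next_token[:, i : i + 1], ys[:, i + 2 :]], axis=-1)

print(ys.numpy())  # greedily decoded sequence, still prefixed by SOS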

Source code for doctr.models.recognition.master.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER: +def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"]) + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) - kwargs['vocab'] = _cfg['vocab'] + kwargs["vocab"] = _cfg["vocab"] + kwargs["input_shape"] = _cfg["input_shape"] # Build the model - model = MASTER(cfg=_cfg, **kwargs) + model = MASTER( + backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False), + cfg=_cfg, + **kwargs, + ) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model
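The post-processor used by these recognition heads maps argmax indices back to characters with a lookup over a string tensor, joins them, and keeps only what precedes the first <eos>. In isolation, with a toy vocabulary and fake indices:

import tensorflow as tf

vocab = "abc"                                         # toy vocabulary
embedding = tf.constant(list(vocab) + ["<eos>"], dtype=tf.string)  # index 3 -> <eos>

out_idxs = tf.constant([[2, 0, 1, 3, 0],              # "cab" then <eos>
                        [1, 3, 2, 2, 2]])             # "b" then <eos>

chars = tf.nn.embedding_lookup(embedding, out_idxs)   # per-timestep characters
joined = tf.strings.reduce_join(chars, axis=-1)       # "cab<eos>a", "b<eos>ccc"
words = tf.strings.split(joined, "<eos>")             # cut at the end-of-sequence marker
words = tf.sparse.to_dense(words.to_sparse(), default_value="")[:, 0]
print([w.decode() for w in words.numpy()])            # ['cab', 'b']

The confidence reported alongside each word is simply the minimum per-timestep softmax probability, which is why the newer code clips it to [0, 1] before returning it.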
-[docs] +[docs] def master(pretrained: bool = False, **kwargs: Any) -> MASTER: """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import master - >>> model = master(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + + >>> import tensorflow as tf + >>> from doctr.models import master + >>> model = master(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keywoard arguments passed to the MASTER architecture + Returns: + ------- text recognition architecture """ - - return _master('master', pretrained, **kwargs)
+ return _master("master", pretrained, magc_resnet31, **kwargs)
@@ -727,8 +654,8 @@

Source code for doctr.models.recognition.master.tensorflow

- +
+ diff --git a/v0.5.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.5.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/models/recognition/sar.html b/v0.5.0/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.5.0/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.sar - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H * W) -> (N, 1)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
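The attention module above computes a "glimpse": the decoder hidden state and the feature map are projected into a common space, a one-channel convolution scores each position, a softmax over H*W normalises the scores, and the weighted sum of the original features is returned. A rough sketch with illustrative shapes:

import tensorflow as tf
from tensorflow.keras import layers

features = tf.random.normal((2, 4, 16, 128))        # (N, H, W, C) feature map
hidden_state = tf.random.normal((2, 1, 1, 256))     # (N, 1, 1, rnn_units)
attention_units = 64

feat_proj = layers.Conv2D(attention_units, 3, padding="same")(features)      # (N, H, W, A)
hid_proj = layers.Conv2D(attention_units, 1, use_bias=False)(hidden_state)   # (N, 1, 1, A)

# Additive attention: one score per spatial position, normalised over H * W
scores = layers.Conv2D(1, 1, use_bias=False)(tf.math.tanh(feat_proj + hid_proj))
weights = tf.nn.softmax(tf.reshape(scores, (2, -1)))
attention_map = tf.reshape(weights, (-1, 4, 16, 1))

glimpse = tf.reduce_sum(features * attention_map, axis=[1, 2])   # (N, C)
print(glimpse.shape)  # (2, 128)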
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, 1)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + 1) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/sar/tensorflow.html index e514e4f0c4..4a591e6451 100644 --- a/v0.5.0/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.sar.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
+
 import tensorflow as tf
-from tensorflow.keras import Sequential, layers, Model
-from typing import Tuple, Dict, List, Any, Optional
+from tensorflow.keras import Model, Sequential, layers
 
-from ... import backbones
-from ...utils import load_pretrained_params
-from ..core import RecognitionModel, RecognitionPostProcessor
+from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
 
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
+from ...classification import resnet31
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from ..core import RecognitionModel, RecognitionPostProcessor
+
+__all__ = ["SAR", "sar_resnet31"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
+    "sar_resnet31": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/sar_resnet31-5a58806c.weights.h5&src=0",
     },
 }
 
 
+class SAREncoder(layers.Layer, NestedObject):
+    """Implements encoder module of the SAR model
+
+    Args:
+    ----
+        rnn_units: number of hidden rnn units
+        dropout_prob: dropout probability
+    """
+
+    def __init__(self, rnn_units: int, dropout_prob: float = 0.0) -> None:
+        super().__init__()
+        self.rnn = Sequential([
+            layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
+            layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
+        ])
+
+    def call(
+        self,
+        x: tf.Tensor,
+        **kwargs: Any,
+    ) -> tf.Tensor:
+        # (N, C)
+        return self.rnn(x, **kwargs)
+
+
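The new SAREncoder turns the backbone output into a single holistic vector: the feature map is max-pooled over its height, the resulting width-wise sequence goes through two stacked LSTMs, and the final hidden state summarises the whole word crop. A rough sketch of that pipeline with assumed sizes:

import tensorflow as tf
from tensorflow.keras import Sequential, layers

features = tf.random.normal((2, 4, 16, 128))   # hypothetical backbone output (N, H, W, C)

# Vertical max pooling: collapse the height axis -> (N, W, C), one step per image column
pooled = tf.reduce_max(features, axis=1)

# Two stacked LSTMs; only the last hidden state of the second one is kept
encoder = Sequential([
    layers.LSTM(units=256, return_sequences=True),
    layers.LSTM(units=256, return_sequences=False),
])
holistic = encoder(pooled)                     # (N, 256) summary of the whole word image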
 class AttentionModule(layers.Layer, NestedObject):
     """Implements attention module of the SAR model
 
     Args:
+    ----
         attention_units: number of hidden attention units
 
     """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
 
+    def __init__(self, attention_units: int) -> None:
         super().__init__()
         self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            3,
+            strides=1,
+            use_bias=True,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
+            1,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.flatten = layers.Flatten()
 
@@ -343,12 +395,12 @@ 

Source code for doctr.models.recognition.sar.tensorflow

hidden_state: tf.Tensor, **kwargs: Any, ) -> tf.Tensor: - [H, W] = features.get_shape().as_list()[1:3] - # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) - hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) # shape (N, H, W, vgg_units) -> (N, H, W, attention_units) features_projection = self.features_projector(features, **kwargs) + # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) + hidden_state = tf.expand_dims(tf.expand_dims(hidden_state, axis=1), axis=1) + hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) projection = tf.math.tanh(hidden_state_projection + features_projection) # shape (N, H, W, attention_units) -> (N, H, W, 1) attention = self.attention_projector(projection, **kwargs) @@ -358,23 +410,25 @@

Source code for doctr.models.recognition.sar.tensorflow

# shape (N, H * W) -> (N, H, W, 1) attention_map = tf.reshape(attention, [-1, H, W, 1]) glimpse = tf.math.multiply(features, attention_map) - # shape (N, H * W) -> (N, 1) - glimpse = tf.reduce_sum(glimpse, axis=[1, 2]) - return glimpse + # shape (N, H * W) -> (N, C) + return tf.reduce_sum(glimpse, axis=[1, 2]) class SARDecoder(layers.Layer, NestedObject): """Implements decoder module of the SAR model Args: + ---- rnn_units: number of hidden units in recurrent cells max_length: maximum length of a sequence vocab_size: number of classes in the model alphabet embedding_units: number of hidden embedding units attention_units: number of hidden attention units - num_decoder_layers: number of LSTM layers to stack + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability """ + def __init__( self, rnn_units: int, @@ -382,23 +436,22 @@

Source code for doctr.models.recognition.sar.tensorflow

vocab_size: int, embedding_units: int, attention_units: int, - num_decoder_layers: int = 2, - input_shape: Optional[List[Tuple[Optional[int]]]] = None, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, ) -> None: - super().__init__() self.vocab_size = vocab_size - self.lstm_decoder = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] - ) - self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) - self.attention_module = AttentionModule(attention_units) - self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units)) self.max_length = max_length - # Initialize kernels - if input_shape is not None: - self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units))) + self.embed = layers.Dense(embedding_units, use_bias=False) + self.embed_tgt = layers.Embedding(embedding_units, self.vocab_size + 1) + + self.lstm_cells = layers.StackedRNNCells([ + layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells) + ]) + self.attention_module = AttentionModule(attention_units) + self.output_dense = layers.Dense(self.vocab_size + 1, use_bias=True) + self.dropout = layers.Dropout(dropout_prob) def call( self, @@ -407,40 +460,47 @@

Source code for doctr.models.recognition.sar.tensorflow

gt: Optional[tf.Tensor] = None, **kwargs: Any, ) -> tf.Tensor: - - # initialize states (each of shape (N, rnn_units)) - states = self.lstm_decoder.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=tf.float32 - ) - # run first step of lstm - # holistic: shape (N, rnn_units) - _, states = self.lstm_decoder(holistic, states, **kwargs) - # Initialize with the index of virtual START symbol (placed after <eos>) - symbol = tf.fill(features.shape[0], self.vocab_size + 1) - logits_list = [] - if kwargs.get('training') and gt is None: - raise ValueError('Need to provide labels during training for teacher forcing') - for t in range(self.max_length + 1): # keep 1 step for <eos> - # one-hot symbol with depth vocab_size + 1 - # embeded_symbol: shape (N, embedding_units) - embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs) - logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs) - glimpse = self.attention_module( - features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs, - ) - # logits: shape (N, rnn_units), glimpse: shape (N, 1) - logits = tf.concat([logits, glimpse], axis=-1) - # shape (N, rnn_units + 1) -> (N, vocab_size + 1) - logits = self.output_dense(logits, **kwargs) - # update symbol with predicted logits for t+1 step - if kwargs.get('training'): - symbol = gt[:, t] # type: ignore[index] + if gt is not None: + gt_embedding = self.embed_tgt(gt, **kwargs) + + logits_list: List[tf.Tensor] = [] + + for t in range(self.max_length + 1): # 32 + if t == 0: + # step to init the first states of the LSTMCell + states = self.lstm_cells.get_initial_state( + inputs=None, batch_size=features.shape[0], dtype=features.dtype + ) + prev_symbol = holistic + elif t == 1: + # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros + # (N, vocab_size + 1) --> (N, embedding_units) + prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1], dtype=features.dtype) + prev_symbol = self.embed(prev_symbol, **kwargs) else: - symbol = tf.argmax(logits, axis=-1) - logits_list.append(logits) - outputs = tf.stack(logits_list, axis=1) # shape (N, max_length + 1, vocab_size + 1) - - return outputs + if gt is not None and kwargs.get("training", False): + # (N, embedding_units) -2 because of <bos> and <eos> (same) + prev_symbol = self.embed(gt_embedding[:, t - 2], **kwargs) + else: + # -1 to start at timestep where prev_symbol was initialized + index = tf.argmax(logits_list[t - 1], axis=-1) + # update prev_symbol with ones at the index of the previous logit vector + prev_symbol = self.embed(self.embed_tgt(index, **kwargs), **kwargs) + + # (N, C), (N, C) take the last hidden state and cell state from current timestep + _, states = self.lstm_cells(prev_symbol, states, **kwargs) + # states = (hidden_state, cell_state) + hidden_state = states[0][0] + # (N, H, W, C), (N, C) --> (N, C) + glimpse = self.attention_module(features, hidden_state, **kwargs) + # (N, C), (N, C) --> (N, 2 * C) + logits = tf.concat([hidden_state, glimpse], axis=1) + logits = self.dropout(logits, **kwargs) + # (N, vocab_size + 1) + logits_list.append(self.output_dense(logits, **kwargs)) + + # (max_length + 1, N, vocab_size + 1) --> (N, max_length + 1, vocab_size + 1) + return tf.transpose(tf.stack(logits_list[1:]), (1, 0, 2)) class SAR(Model, RecognitionModel): @@ -448,17 +508,20 @@
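The rewritten decoder loop above alternates between teacher forcing (feeding the ground-truth symbol at training time) and feeding back its own argmax at inference. With the LSTM cells and the attention glimpse abstracted behind stand-in helpers (decode_step and embed are not library functions), the branching logic boils down to:

import tensorflow as tf

VOCAB_SIZE, MAX_LEN = 5, 6   # illustrative sizes

def decode_step(prev_symbol):
    # Stand-in for LSTM cells + attention glimpse + Dense: logits of shape (N, VOCAB_SIZE + 1)
    return tf.random.normal((prev_symbol.shape[0], VOCAB_SIZE + 1))

def embed(token_ids):
    # Stand-in for the target embedding layer
    return tf.one_hot(token_ids, depth=VOCAB_SIZE + 1)

def run_decoder(holistic, gt=None, training=False):
    logits_list = []
    prev_symbol = holistic                        # first step is seeded with the holistic feature
    for t in range(1, MAX_LEN + 1):
        logits = decode_step(prev_symbol)
        logits_list.append(logits)
        if training and gt is not None:
            prev_symbol = embed(gt[:, t - 1])     # teacher forcing: feed the ground truth
        else:
            prev_symbol = embed(tf.argmax(logits, axis=-1))  # feed back own prediction
    return tf.stack(logits_list, axis=1)          # (N, MAX_LEN, VOCAB_SIZE + 1)

out = run_decoder(tf.random.normal((2, VOCAB_SIZE + 1)))
print(out.shape)  # (2, 6, 6)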

Source code for doctr.models.recognition.sar.tensorflow

Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of hidden units in both encoder and decoder LSTM embedding_units: number of embedding units attention_units: number of hidden units in attention module max_length: maximum word length handled by the model - num_decoders: number of LSTM to stack in decoder layer - + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability for the encoder and decoder + exportable: onnx exportable returns only logits + cfg: dictionary containing information about the model """ - _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"] def __init__( self, @@ -468,36 +531,34 @@

Source code for doctr.models.recognition.sar.tensorflow

embedding_units: int = 512, attention_units: int = 512, max_length: int = 30, - num_decoders: int = 2, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: - super().__init__() self.vocab = vocab + self.exportable = exportable self.cfg = cfg - self.max_length = max_length + 1 # Add 1 timestep for EOS after the longest word self.feat_extractor = feature_extractor - self.encoder = Sequential( - [ - layers.LSTM(units=rnn_units, return_sequences=True), - layers.LSTM(units=rnn_units, return_sequences=False) - ] - ) - # Initialize the kernels (watch out for reduce_max) - self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:]) - + self.encoder = SAREncoder(rnn_units, dropout_prob) self.decoder = SARDecoder( - rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders, - input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape] + rnn_units, + self.max_length, + len(vocab), + embedding_units, + attention_units, + num_decoder_cells, + dropout_prob, ) self.postprocessor = SARPostProcessor(vocab=vocab) + @staticmethod def compute_loss( - self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: tf.Tensor, @@ -506,11 +567,13 @@

Source code for doctr.models.recognition.sar.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -525,7 +588,7 @@

Source code for doctr.models.recognition.sar.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len) masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) def call( @@ -536,16 +599,28 @@

Source code for doctr.models.recognition.sar.tensorflow

return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - features = self.feat_extractor(x, **kwargs) - pooled_features = tf.reduce_max(features, axis=1) # vertical max pooling + # vertical max pooling --> (N, C, W) + pooled_features = tf.reduce_max(features, axis=1) + # holistic (N, C) encoded = self.encoder(pooled_features, **kwargs) + if target is not None: - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) seq_len = tf.cast(seq_len, tf.int32) - decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training for teacher forcing") + + decoded_features = _bf16_to_float32( + self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + ) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = decoded_features + return out + if return_model_output: out["out_map"] = decoded_features @@ -554,7 +629,7 @@
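The exportable flag threaded through this refactor short-circuits the usual output dictionary so that graph tracing (e.g. for ONNX export) sees a single logits tensor instead of a Python dict with optional keys. Since the sar_resnet31 factory forwards its keyword arguments down to the SAR class, a hypothetical comparison looks like:

import tensorflow as tf
from doctr.models import sar_resnet31

dummy = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)

# Regular model: a dict with "preds" (and optionally "out_map" / "loss")
model = sar_resnet31(pretrained=True)
out = model(dummy, return_preds=True)
print(out["preds"])                    # [(word, confidence)]

# Export-friendly model: the forward pass returns only {"logits": ...}
export_model = sar_resnet31(pretrained=True, exportable=True)
logits_only = export_model(dummy)["logits"]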

Source code for doctr.models.recognition.sar.tensorflow

out["preds"] = self.postprocessor(decoded_features) if target is not None: - out['loss'] = self.compute_loss(decoded_features, gt, seq_len) + out["loss"] = self.compute_loss(decoded_features, gt, seq_len) return out @@ -563,9 +638,8 @@

Source code for doctr.models.recognition.sar.tensorflow

"""Post processor for SAR architectures Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -580,95 +654,75 @@

Source code for doctr.models.recognition.sar.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: +def _sar( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> SAR: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units']) - _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units']) - _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length']) - _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) # Feature extractor - feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + pretrained=pretrained_backbone, + input_shape=_cfg["input_shape"], include_top=False, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - kwargs['embedding_units'] = _cfg['embedding_units'] - kwargs['attention_units'] = _cfg['attention_units'] - kwargs['max_length'] = _cfg['max_length'] - kwargs['num_decoders'] = _cfg['num_decoders'] + kwargs["vocab"] = _cfg["vocab"] # Build the model model = SAR(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model -
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - -
-[docs] +[docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import sar_resnet31 + >>> model = sar_resnet31(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the SAR architecture Returns: + ------- text recognition architecture """ - - return _sar('sar_resnet31', pretrained, **kwargs)
+ return _sar("sar_resnet31", pretrained, resnet31, **kwargs)
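A hedged usage sketch for the refactored SAR entry point above, reusing the input shape from the docstring example in this hunk; the flags follow the `call` signature shown earlier (values are illustrative, not verified against a specific release):

>>> import tensorflow as tf
>>> from doctr.models import sar_resnet31
>>> model = sar_resnet31(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor, return_preds=True)  # inference: "preds" holds (word, confidence) pairs
>>> out = model(input_tensor, target=["hello"], return_model_output=True)  # also adds "out_map" and "loss"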
@@ -702,8 +756,8 @@

Source code for doctr.models.recognition.sar.tensorflow

- +
+ diff --git a/v0.5.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.5.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.5.0/_modules/doctr/models/recognition/zoo.html b/v0.5.0/_modules/doctr/models/recognition/zoo.html index bf0ae6af6e..f664304019 100644 --- a/v0.5.0/_modules/doctr/models/recognition/zoo.html +++ b/v0.5.0/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
-from doctr.file_utils import is_tf_available, is_torch_available
-from .core import RecognitionPredictor
-from ..preprocessor import PreProcessor
-from .. import recognition
+from doctr.file_utils import is_tf_available
+from doctr.models.preprocessor import PreProcessor
 
+from .. import recognition
+from .predictor import RecognitionPredictor
 
 __all__ = ["recognition_predictor"]
 
 
-if is_tf_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
-elif is_torch_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
+ARCHS: List[str] = [
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "sar_resnet31",
+    "master",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
+]
+
 
+def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+        _model = recognition.__dict__[arch](
+            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
+        )
+    else:
+        if not isinstance(
+            arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+        ):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+    kwargs.pop("pretrained_backbone", None)
 
-    _model = recognition.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 32)
-    predictor = RecognitionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
-        _model
-    )
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
+    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
+    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
 
     return predictor
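A minimal usage sketch of the public wrapper documented below, assuming a standard docTR install (the crop size is illustrative; any uint8 RGB word crop works):

>>> import numpy as np
>>> from doctr.models import recognition_predictor
>>> predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=64)
>>> crop = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
>>> out = predictor([crop])  # list of (word, confidence) predictions

Since the `_predictor` helper above also accepts an already-built model, a custom `recognition.CRNN`, `recognition.SAR`, `recognition.MASTER`, `recognition.ViTSTR` or `recognition.PARSeq` instance can be passed in place of the architecture name.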
 
 
 
-[docs] -def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor: +[docs] +def recognition_predictor( + arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + symmetric_pad: bool = False, + batch_size: int = 128, + **kwargs: Any, +) -> RecognitionPredictor: """Text recognition architecture. Example:: @@ -326,14 +369,18 @@

Source code for doctr.models.recognition.zoo

        >>> out = model([input_page])
 
     Args:
-        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
+    ----
+        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
+        **kwargs: optional parameters to be passed to the architecture
 
     Returns:
+    -------
         Recognition predictor
     """
-
-    return _predictor(arch, pretrained, **kwargs)
+ return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
@@ -367,8 +414,8 @@

Source code for doctr.models.recognition.zoo

   
-
- +
+ diff --git a/v0.5.0/_modules/doctr/models/zoo.html b/v0.5.0/_modules/doctr/models/zoo.html index dec6857019..d459671648 100644 --- a/v0.5.0/_modules/doctr/models/zoo.html +++ b/v0.5.0/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from typing import Any
-from .core import OCRPredictor
+
 from .detection.zoo import detection_predictor
+from .kie_predictor import KIEPredictor
+from .predictor import OCRPredictor
 from .recognition.zoo import recognition_predictor
 
+__all__ = ["ocr_predictor", "kie_predictor"]
 
-__all__ = ["ocr_predictor"]
-
-
-def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
 
+def _predictor(
+    det_arch: Any,
+    reco_arch: Any,
+    pretrained: bool,
+    pretrained_backbone: bool = True,
+    assume_straight_pages: bool = True,
+    preserve_aspect_ratio: bool = True,
+    symmetric_pad: bool = True,
+    det_bs: int = 2,
+    reco_bs: int = 128,
+    detect_orientation: bool = False,
+    straighten_pages: bool = False,
+    detect_language: bool = False,
+    **kwargs,
+) -> OCRPredictor:
     # Detection
-    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
+    det_predictor = detection_predictor(
+        det_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=det_bs,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+    )
 
     # Recognition
-    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
+    reco_predictor = recognition_predictor(
+        reco_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=reco_bs,
+    )
 
-    return OCRPredictor(det_predictor, reco_predictor)
+    return OCRPredictor(
+        det_predictor,
+        reco_predictor,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+        detect_orientation=detect_orientation,
+        straighten_pages=straighten_pages,
+        detect_language=detect_language,
+        **kwargs,
+    )
 
 
 
-[docs] +[docs] def ocr_predictor( - det_arch: str = 'db_resnet50', - reco_arch: str = 'crnn_vgg16_bn', + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", pretrained: bool = False, - **kwargs: Any + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - Example:: - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` Returns: + ------- OCR predictor """ + return _predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
+ + - return _predictor(det_arch, reco_arch, pretrained, **kwargs)
+def _kie_predictor( + det_arch: Any, + reco_arch: Any, + pretrained: bool, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + det_bs: int = 2, + reco_bs: int = 128, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs, +) -> KIEPredictor: + # Detection + det_predictor = detection_predictor( + det_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=det_bs, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + ) + + # Recognition + reco_predictor = recognition_predictor( + reco_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=reco_bs, + ) + + return KIEPredictor( + det_predictor, + reco_predictor, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + ) + + +
+[docs] +def kie_predictor( + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, +) -> KIEPredictor: + """End-to-end KIE architecture using one model for localization, and another for text recognition. + + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` + + Returns: + ------- + KIE predictor + """ + return _kie_predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
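A hedged end-to-end sketch for the two builders above (note that the kie_predictor docstring example reuses ocr_predictor; the call below exercises the KIE builder itself):

>>> import numpy as np
>>> from doctr.models import ocr_predictor, kie_predictor
>>> page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> ocr = ocr_predictor("db_resnet50", "crnn_vgg16_bn", pretrained=True)
>>> ocr_out = ocr([page])
>>> kie = kie_predictor(pretrained=True, detect_language=True)
>>> kie_out = kie([page])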
@@ -353,8 +575,8 @@

Source code for doctr.models.zoo

       
     
   
- - + + diff --git a/v0.5.0/_modules/doctr/transforms/modules.html b/v0.5.0/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.5.0/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - - - - - - - - - - - - doctr.transforms.modules - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/_modules/doctr/transforms/modules/base.html b/v0.5.0/_modules/doctr/transforms/modules/base.html index c42079a8fd..4596df3848 100644 --- a/v0.5.0/_modules/doctr/transforms/modules/base.html +++ b/v0.5.0/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.base

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+import math
 import random
-from typing import List, Any, Callable
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import numpy as np
 
 from doctr.utils.repr import NestedObject
+
 from .. import functional as F
 
+__all__ = ["SampleCompose", "ImageTransform", "ColorInversion", "OneOf", "RandomApply", "RandomRotate", "RandomCrop"]
+
+
+class SampleCompose(NestedObject):
+    """Implements a wrapper that will apply transformations sequentially on both image and target
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfo = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(30)])
+                >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4)))
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import torch
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfos = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(30)])
+                >>> out, out_boxes = transfos(torch.rand(8, 64, 64, 3), np.zeros((2, 4)))
+
+    Args:
+    ----
+        transforms: list of transformation modules
+    """
+
+    _children_names: List[str] = ["sample_transforms"]
+
+    def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None:
+        self.sample_transforms = transforms
+
+    def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]:
+        for t in self.sample_transforms:
+            x, target = t(x, target)
+
+        return x, target
+
+
+class ImageTransform(NestedObject):
+    """Implements a transform wrapper to turn an image-only transformation into an image+target transform
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion(min_val=0.6))
+                >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None)
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import torch
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion(min_val=0.6))
+                >>> out, _ = transfo(torch.rand(8, 64, 64, 3), None)
+
+    Args:
+    ----
+        transform: the image transformation module to wrap
+    """
+
+    _children_names: List[str] = ["img_transform"]
+
+    def __init__(self, transform: Callable[[Any], Any]) -> None:
+        self.img_transform = transform
 
-__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
+    def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]:
+        img = self.img_transform(img)
+        return img, target
 
 
 
-[docs] +[docs] class ColorInversion(NestedObject): """Applies the following tranformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(torch.rand(8, 64, 64, 3)) Args: + ---- min_val: range [min_val, 1] to colorize RGB pixels """ + def __init__(self, min_val: float = 0.5) -> None: self.min_val = min_val @@ -316,59 +437,178 @@

Source code for doctr.transforms.modules.base

-[docs] +[docs] class OneOf(NestedObject): """Randomly apply one of the input transformations - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transforms: list of transformations, one only will be picked """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: # Pick transformation transfo = self.transforms[int(random.random() * len(self.transforms))] # Apply - return transfo(img)
+ return transfo(img) if target is None else transfo(img, target) # type: ignore[call-arg]
-[docs] +[docs] class RandomApply(NestedObject): """Apply with a probability p the input transformation - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transform: transformation to apply p: probability to apply """ - def __init__(self, transform: Callable[[Any], Any], p: float = .5) -> None: + + def __init__(self, transform: Callable[[Any], Any], p: float = 0.5) -> None: self.transform = transform self.p = p def extra_repr(self) -> str: return f"transform={self.transform}, p={self.p}" - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: if random.random() < self.p: - return self.transform(img) - return img
+ return self.transform(img) if target is None else self.transform(img, target) # type: ignore[call-arg] + return img if target is None else (img, target)
+ + + +
+[docs] +class RandomRotate(NestedObject): + """Randomly rotate a tensor image and its boxes + + .. image:: https://doctr-static.mindee.com/models?id=v0.4.0/rotation_illustration.png&src=0 + :align: center + + Args: + ---- + max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in + [-max_angle, max_angle] + expand: whether the image should be padded before the rotation + """ + + def __init__(self, max_angle: float = 5.0, expand: bool = False) -> None: + self.max_angle = max_angle + self.expand = expand + + def extra_repr(self) -> str: + return f"max_angle={self.max_angle}, expand={self.expand}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + angle = random.uniform(-self.max_angle, self.max_angle) + r_img, r_polys = F.rotate_sample(img, target, angle, self.expand) + # Removes deleted boxes + is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2 + return r_img, r_polys[is_kept]
+ + + +
+[docs] +class RandomCrop(NestedObject): + """Randomly crop a tensor image and its boxes + + Args: + ---- + scale: tuple of floats, relative (min_area, max_area) of the crop + ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w + """ + + def __init__(self, scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: + self.scale = scale + self.ratio = ratio + + def extra_repr(self) -> str: + return f"scale={self.scale}, ratio={self.ratio}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + scale = random.uniform(self.scale[0], self.scale[1]) + ratio = random.uniform(self.ratio[0], self.ratio[1]) + + height, width = img.shape[:2] + + # Calculate crop size + crop_area = scale * width * height + aspect_ratio = ratio * (width / height) + crop_width = int(round(math.sqrt(crop_area * aspect_ratio))) + crop_height = int(round(math.sqrt(crop_area / aspect_ratio))) + + # Ensure crop size does not exceed image dimensions + crop_width = min(crop_width, width) + crop_height = min(crop_height, height) + + # Randomly select crop position + x = random.randint(0, width - crop_width) + y = random.randint(0, height - crop_height) + + # relative crop box + crop_box = (x / width, y / height, (x + crop_width) / width, (y + crop_height) / height) + if target.shape[1:] == (4, 2): + min_xy = np.min(target, axis=1) + max_xy = np.max(target, axis=1) + _target = np.concatenate((min_xy, max_xy), axis=1) + else: + _target = target + + # Crop image and targets + croped_img, crop_boxes = F.crop_detection(img, _target, crop_box) + # hard fallback if no box is kept + if crop_boxes.shape[0] == 0: + return img, target + # clip boxes + return croped_img, np.clip(crop_boxes, 0, 1)
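A hedged composition sketch for the sample-level transforms added in this hunk (the box values are illustrative, shaped (n_boxes, 4, 2) as expected by RandomRotate):

>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
>>> transfo = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(10, expand=True)])
>>> img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
>>> boxes = np.array([[[0.1, 0.1], [0.4, 0.1], [0.4, 0.3], [0.1, 0.3]]])
>>> out_img, out_boxes = transfo(img, boxes)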
@@ -402,8 +642,8 @@

Source code for doctr.transforms.modules.base

- - + + diff --git a/v0.5.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.5.0/_modules/doctr/transforms/modules/tensorflow.html index 1d192a876b..acbbe96225 100644 --- a/v0.5.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.5.0/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
 import tensorflow as tf
-from typing import List, Any, Tuple, Callable
 
 from doctr.utils.repr import NestedObject
 
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
-           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
+from ..functional.tensorflow import _gaussian_filter, random_shadow
+
+__all__ = [
+    "Compose",
+    "Resize",
+    "Normalize",
+    "LambdaTransformation",
+    "ToGray",
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomSaturation",
+    "RandomHue",
+    "RandomGamma",
+    "RandomJpegQuality",
+    "GaussianBlur",
+    "ChannelShuffle",
+    "GaussianNoise",
+    "RandomHorizontalFlip",
+    "RandomShadow",
+    "RandomResize",
+]
 
 
 
-[docs] +[docs] class Compose(NestedObject): """Implements a wrapper that will apply transformations sequentially - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Compose, Resize + >>> transfos = Compose([Resize((32, 32))]) + >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- transforms: list of transformation modules """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms @@ -319,26 +361,27 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class Resize(NestedObject): """Resizes a tensor to a target size - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Resize + >>> transfo = Resize((32, 32)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- output_size: expected output size method: interpolation method preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically """ + def __init__( self, - output_size: Tuple[int, int], - method: str = 'bilinear', + output_size: Union[int, Tuple[int, int]], + method: str = "bilinear", preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, ) -> None: @@ -346,6 +389,14 @@

Source code for doctr.transforms.modules.tensorflow

self.method = method self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad + self.antialias = True + + if isinstance(self.output_size, int): + self.wanted_size = (self.output_size, self.output_size) + elif isinstance(self.output_size, (tuple, list)): + self.wanted_size = self.output_size + else: + raise AssertionError("Output size should be either a list, a tuple or an int") def extra_repr(self) -> str: _repr = f"output_size={self.output_size}, method='{self.method}'" @@ -353,64 +404,106 @@

Source code for doctr.transforms.modules.tensorflow

_repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" return _repr - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) + def __call__( + self, + img: tf.Tensor, + target: Optional[np.ndarray] = None, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: + input_dtype = img.dtype + self.output_size = ( + (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size + ) + + img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias) + # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio + raw_shape = img.shape[:2] + if self.symmetric_pad: + half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0) if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
+ if isinstance(self.output_size, (tuple, list)): + # In that case we need to pad because we want to enforce both width and height + if not self.symmetric_pad: + half_pad = (0, 0) + elif self.output_size[0] == img.shape[0]: + half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2)) + # Pad image + img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size) + + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: + if self.symmetric_pad: + offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1] + + if self.preserve_aspect_ratio: + # Get absolute coords + if target.shape[1:] == (4,): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1] + target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0] + else: + target[:, [0, 2]] *= raw_shape[1] / img.shape[1] + target[:, [1, 3]] *= raw_shape[0] / img.shape[0] + elif target.shape[1:] == (4, 2): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1] + target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0] + else: + target[..., 0] *= raw_shape[1] / img.shape[1] + target[..., 1] *= raw_shape[0] / img.shape[0] + else: + raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") + + return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1) + + return tf.cast(img, dtype=input_dtype)
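A hedged sketch of the extended Resize call above, resizing an image together with relative (xmin, ymin, xmax, ymax) boxes, which is one of the two target layouts handled there:

>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import Resize
>>> transfo = Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
>>> img = tf.random.uniform(shape=[64, 128, 3], minval=0, maxval=1)
>>> boxes = np.array([[0.1, 0.1, 0.4, 0.3]])
>>> out_img, out_boxes = transfo(img, boxes)  # boxes are shifted and rescaled onto the padded output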
-[docs] +[docs] class Normalize(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Normalize + >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- mean: average value per channel std: standard deviation per channel """ + def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) + self.mean = tf.constant(mean) + self.std = tf.constant(std) def extra_repr(self) -> str: return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std + img -= tf.cast(self.mean, dtype=img.dtype) + img /= tf.cast(self.std, dtype=img.dtype) return img
-[docs] +[docs] class LambdaTransformation(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import LambdaTransformation + >>> transfo = LambdaTransformation(lambda x: x/ 255.) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- fn: the function to be applied to the input tensor """ + def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: self.fn = fn @@ -420,37 +513,42 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class ToGray(NestedObject): """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import ToGray + >>> transfo = ToGray() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) """ + + def __init__(self, num_output_channels: int = 1): + self.num_output_channels = num_output_channels + def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
+ img = tf.image.rgb_to_grayscale(img) + return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
-[docs] +[docs] class RandomBrightness(NestedObject): """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomBrightness + >>> transfo = RandomBrightness() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] p: probability to apply transformation """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -463,21 +561,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomContrast(NestedObject): """Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomContrast + >>> transfo = RandomContrast() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) """ - def __init__(self, delta: float = .3) -> None: + + def __init__(self, delta: float = 0.3) -> None: self.delta = delta def extra_repr(self) -> str: @@ -489,21 +588,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomSaturation(NestedObject): """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomSaturation + >>> transfo = RandomSaturation() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) """ - def __init__(self, delta: float = .5) -> None: + + def __init__(self, delta: float = 0.5) -> None: self.delta = delta def extra_repr(self) -> str: @@ -515,19 +615,20 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomHue(NestedObject): """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHue + >>> transfo = RandomHue() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -540,22 +641,23 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomGamma(NestedObject): """randomly performs gamma correction for a tensor (batch of images or image) - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomGamma + >>> transfo = RandomGamma() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- min_gamma: non-negative real number, lower bound for gamma param max_gamma: non-negative real number, upper bound for gamma min_gain: lower bound for constant multiplier max_gain: upper bound for constant multiplier """ + def __init__( self, min_gamma: float = 0.5, @@ -580,20 +682,21 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomJpegQuality(NestedObject): """Randomly adjust jpeg quality of a 3 dimensional RGB image - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomJpegQuality + >>> transfo = RandomJpegQuality() + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- min_quality: int between [0, 100] max_quality: int between [0, 100] """ + def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: self.min_quality = min_quality self.max_quality = max_quality @@ -602,10 +705,224 @@

Source code for doctr.transforms.modules.tensorflow

return f"min_quality={self.min_quality}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality + return tf.image.random_jpeg_quality(img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality)
+ + + +
+[docs]
+class GaussianBlur(NestedObject):
+    """Randomly apply gaussian blur to a 3 dimensional RGB image
+
+    >>> import tensorflow as tf
+    >>> from doctr.transforms import GaussianBlur
+    >>> transfo = GaussianBlur(3, (.1, 5))
+    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
+
+    Args:
+    ----
+        kernel_shape: size of the blurring kernel
+        std: min and max value of the standard deviation
+    """
+
+    def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None:
+        self.kernel_shape = kernel_shape
+        self.std = std
+
+    def extra_repr(self) -> str:
+        return f"kernel_shape={self.kernel_shape}, std={self.std}"
+
+    def __call__(self, img: tf.Tensor) -> tf.Tensor:
+        return tf.squeeze(
+            _gaussian_filter(
+                img[tf.newaxis, ...],
+                kernel_size=self.kernel_shape,
+                sigma=random.uniform(self.std[0], self.std[1]),
+                mode="REFLECT",
+            ),
+            axis=0,
+        )
+ + +
+[docs] +class ChannelShuffle(NestedObject): + """Randomly shuffle channel order of a given image""" + + def __init__(self): + pass + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
+ + + +
+[docs] +class GaussianNoise(NestedObject): + """Adds Gaussian Noise to the input tensor + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianNoise + >>> transfo = GaussianNoise(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + mean : mean of the gaussian distribution + std : std of the gaussian distribution + """ + + def __init__(self, mean: float = 0.0, std: float = 1.0) -> None: + super().__init__() + self.std = std + self.mean = mean + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), dtype=tf.uint8 + ) + else: + return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype) + + def extra_repr(self) -> str: + return f"mean={self.mean}, std={self.std}"
+ + + +
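A small sketch of what the call above does numerically; note that, as written, the noise is drawn uniformly in [mean - std, mean + std] rather than from a true Gaussian (assuming TensorFlow is available):

import tensorflow as tf
from doctr.transforms import GaussianNoise

transfo = GaussianNoise(mean=0.0, std=0.1)

# float input in [0, 1]: per-pixel noise in [-0.1, 0.1] is added, then values are clipped back to [0, 1]
float_img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
out_float = transfo(float_img)

# uint8 input: the noise is scaled by 255, rounded and clipped to [0, 255], and the dtype is preserved
uint8_img = tf.cast(255 * float_img, tf.uint8)
out_uint8 = transfo(uint8_img)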
+[docs] +class RandomHorizontalFlip(NestedObject): + """Adds random horizontal flip to the input tensor/np.ndarray + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHorizontalFlip + >>> transfo = RandomHorizontalFlip(p=0.5) + >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1) + >>> target = np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32) + >>> out = transfo(image, target) + + Args: + ---- + p : probability of Horizontal Flip + """ + + def __init__(self, p: float) -> None: + super().__init__() + self.p = p + + def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + _img = tf.image.flip_left_right(img) + _target = target.copy() + # Changing the relative bbox coordinates + if target.shape[1:] == (4,): + _target[:, ::2] = 1 - target[:, [2, 0]] + else: + _target[..., 0] = 1 - target[..., 0] + return _img, _target + return img, target
+ + + +
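As a sanity check of the box-mirroring logic above, a minimal worked example (p=1 so the flip always triggers; boxes are relative (xmin, ymin, xmax, ymax)):

import numpy as np
import tensorflow as tf
from doctr.transforms import RandomHorizontalFlip

transfo = RandomHorizontalFlip(p=1.0)  # always flip, to make the example deterministic
image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
target = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)

_, flipped = transfo(image, target)
# xmin/xmax are mirrored around the vertical axis:
# new xmin = 1 - old xmax = 0.6, new xmax = 1 - old xmin = 0.9
# so flipped == [[0.6, 0.1, 0.9, 0.5]]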
+[docs] +class RandomShadow(NestedObject): + """Adds random shade to the input image + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomShadow + >>> transfo = RandomShadow(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + opacity_range : minimum and maximum opacity of the shade + """ + + def __init__(self, opacity_range: Optional[Tuple[float, float]] = None) -> None: + super().__init__() + self.opacity_range = opacity_range if isinstance(opacity_range, tuple) else (0.2, 0.8) + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value( + tf.math.round(255 * random_shadow(tf.cast(x, dtype=tf.float32) / 255, self.opacity_range)), + 0, + 255, + ), + dtype=tf.uint8, + ) + else: + return tf.clip_by_value(random_shadow(x, self.opacity_range), 0, 1) + + def extra_repr(self) -> str: + return f"opacity_range={self.opacity_range}"
+ + + +
+[docs] +class RandomResize(NestedObject): + """Randomly resize the input image and align corresponding targets + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomResize + >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + scale_range: range of the resizing factor for width and height (independently) + preserve_aspect_ratio: whether to preserve the aspect ratio of the image, + given a float value, the aspect ratio will be preserved with this probability + symmetric_pad: whether to symmetrically pad the image, + given a float value, the symmetric padding will be applied with this probability + p: probability to apply the transformation + """ + + def __init__( + self, + scale_range: Tuple[float, float] = (0.3, 0.9), + preserve_aspect_ratio: Union[bool, float] = False, + symmetric_pad: Union[bool, float] = False, + p: float = 0.5, + ): + super().__init__() + self.scale_range = scale_range + self.preserve_aspect_ratio = preserve_aspect_ratio + self.symmetric_pad = symmetric_pad + self.p = p + self._resize = Resize + + def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + scale_h = random.uniform(*self.scale_range) + scale_w = random.uniform(*self.scale_range) + new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w)) + + _img, _target = self._resize( + new_size, + preserve_aspect_ratio=self.preserve_aspect_ratio + if isinstance(self.preserve_aspect_ratio, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + symmetric_pad=self.symmetric_pad + if isinstance(self.symmetric_pad, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + )(img, target) + + return _img, _target + return img, target + + def extra_repr(self) -> str: + return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}" # noqa: E501
+
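A short usage sketch for the class above (a hedged example: it assumes a single relative box as target, as in the other transforms here); height and width are rescaled independently within scale_range:

import numpy as np
import tensorflow as tf
from doctr.transforms import RandomResize

transfo = RandomResize(scale_range=(0.3, 0.9), p=1.0)  # p=1 so the resize always triggers
img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
boxes = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)

out_img, out_boxes = transfo(img, boxes)
# out_img height and width each end up in [0.3 * 64, 0.9 * 64]; boxes are realigned by the underlying Resize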
@@ -638,8 +955,8 @@

Source code for doctr.transforms.modules.tensorflow

- +
+ diff --git a/v0.5.0/_modules/doctr/utils/metrics.html b/v0.5.0/_modules/doctr/utils/metrics.html index 460c64a385..8a37d5949a 100644 --- a/v0.5.0/_modules/doctr/utils/metrics.html +++ b/v0.5.0/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.metrics

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-import cv2
-from typing import List, Tuple, Dict, Optional
-from unidecode import unidecode
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
-from doctr.utils.geometry import rbbox_to_polygon
+from shapely.geometry import Polygon
 
-__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
-           'nms', 'LocalizationConfusion', 'OCRMetric']
+__all__ = [
+    "TextMatch",
+    "box_iou",
+    "polygon_iou",
+    "nms",
+    "LocalizationConfusion",
+    "OCRMetric",
+    "DetectionMetric",
+]
 
 
 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
-    """Perform string comparison with multiple levels of tolerance
+    """Performs string comparison with multiple levels of tolerance
 
     Args:
+    ----
         word1: a string
         word2: another string
 
     Returns:
+    -------
         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-            unidecode counterparts and their lower-case unidecode counterparts match
+            anyascii counterparts and their lower-case anyascii counterparts match
     """
-    raw_match = (word1 == word2)
-    caseless_match = (word1.lower() == word2.lower())
-    unidecode_match = (unidecode(word1) == unidecode(word2))
+    raw_match = word1 == word2
+    caseless_match = word1.lower() == word2.lower()
+    anyascii_match = anyascii(word1) == anyascii(word2)
 
     # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
-    unicase_match = (unidecode(word1).lower() == unidecode(word2).lower())
+    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-    return raw_match, caseless_match, unidecode_match, unicase_match
+    return raw_match, caseless_match, anyascii_match, unicase_match
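To illustrate the ordering remark above, a sketch of the expected flags for the ("EUR", "€") pair (assuming anyascii transliterates "€" to "EUR"):

from doctr.utils.metrics import string_match

raw, caseless, anyascii_match, unicase = string_match("EUR", "€")
# raw and lower-case comparisons fail, the anyascii-based ones succeed:
# (False, False, True, True)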
 
 
 
-[docs] +[docs] class TextMatch: - """Implements text match metric (word-level accuracy) for recognition task. + r"""Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \\forall X, Y \\in \\mathcal{W}^N, - TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) + \forall X, Y \in \mathcal{W}^N, + TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \\forall a, x \\in \\mathcal{W}, - f_a(x) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } x = a \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{W}` is the set of all possible character sequences, + \forall a, x \in \mathcal{W}, + f_a(x) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } x = a \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. - Example:: - >>> from doctr.utils import TextMatch - >>> metric = TextMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() """ def __init__(self) -> None: self.reset() +
+[docs] def update( self, gt: List[str], @@ -354,29 +386,32 @@

Source code for doctr.utils.metrics

         """Update the state of the metric with new predictions
 
         Args:
+        ----
             gt: list of ground-truth character sequences
-            pred: list of predicted character sequences"""
-
+            pred: list of predicted character sequences
+        """
         if len(gt) != len(pred):
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.unidecode += int(_unidecode)
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
-        self.total += len(gt)
+        self.total += len(gt)
+
-[docs] +[docs] def summary(self) -> Dict[str, float]: """Computes the aggregated metrics - Returns: - a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode - counterpart and its lower-case unidecode counterpart + Returns + ------- + a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii + counterpart and its lower-case anyascii counterpart """ if self.total == 0: raise AssertionError("you need to update the metric before getting the summary") @@ -384,7 +419,7 @@

Source code for doctr.utils.metrics

         return dict(
             raw=self.raw / self.total,
             caseless=self.caseless / self.total,
-            unidecode=self.unidecode / self.total,
+            anyascii=self.anyascii / self.total,
             unicase=self.unicase / self.total,
         )
@@ -392,23 +427,25 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.raw = 0
         self.caseless = 0
-        self.unidecode = 0
+        self.anyascii = 0
         self.unicase = 0
         self.total = 0
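A worked run of the metric above, spelling out the summary values (a sketch):

from doctr.utils.metrics import TextMatch

metric = TextMatch()
metric.update(["Hello", "world"], ["hello", "world"])
print(metric.summary())
# "world" matches exactly, "Hello" only matches once case is ignored:
# {'raw': 0.5, 'caseless': 1.0, 'anyascii': 0.5, 'unicase': 1.0}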
def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Compute the IoU between two sets of bounding boxes + """Computes the IoU between two sets of bounding boxes Args: + ---- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) + Returns: + ------- the IoU matrix of shape (N, M) """ - - iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) + iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) @@ -419,107 +456,54 @@

Source code for doctr.utils.metrics

         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
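A tiny worked example for the function above (straight boxes in (xmin, ymin, xmax, ymax) format):

import numpy as np
from doctr.utils.metrics import box_iou

boxes_1 = np.array([[0, 0, 100, 100]], dtype=np.float32)
boxes_2 = np.array([[0, 0, 70, 70], [110, 95, 200, 150]], dtype=np.float32)

iou = box_iou(boxes_1, boxes_2)
# first pair: intersection 70 * 70 = 4900, union 10000 + 4900 - 4900 = 10000 -> IoU = 0.49
# second pair: no overlap -> IoU = 0.0
# iou is approximately [[0.49, 0.0]]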
 
 
-def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
-    """Compute the IoA (intersection over area) between two sets of bounding boxes:
-    ioa(i, j) = inter(i, j) / area(i)
-
-    Args:
-        boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
-        boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
-    Returns:
-        the IoA matrix of shape (N, M)
-    """
-
-    ioa_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
-
-    if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
-        l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
-        l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
-
-        left = np.maximum(l1, l2.T)
-        top = np.maximum(t1, t2.T)
-        right = np.minimum(r1, r2.T)
-        bot = np.minimum(b1, b2.T)
-
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
-        area = (r1 - l1) * (b1 - t1)
-        ioa_mat = intersection / area
-
-    return ioa_mat
-
-
-def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
-    """Compute the IoU between two sets of boolean masks
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
+    """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
-        masks_1: boolean masks of shape (N, H, W)
-        masks_2: boolean masks of shape (M, H, W)
+    ----
+        polys_1: rotated bounding boxes of shape (N, 4, 2)
+        polys_2: rotated bounding boxes of shape (M, 4, 2)
 
     Returns:
+    -------
         the IoU matrix of shape (N, M)
     """
+    if polys_1.ndim != 3 or polys_2.ndim != 3:
+        raise AssertionError("expects boxes to be in format (N, 4, 2)")
 
-    if masks_1.shape[1:] != masks_2.shape[1:]:
-        raise AssertionError("both boolean masks should have the same spatial shape")
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
 
-    iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
 
-    if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
-        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
-        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
-        axes = tuple(range(2, masks_1.ndim + 1))
-        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
 
     return iou_mat
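A minimal sketch of the function above on two axis-aligned unit squares written as 4-point polygons (assuming shapely is installed, as imported at the top of this module):

import numpy as np
from doctr.utils.metrics import polygon_iou

polys_1 = np.array([[[0, 0], [1, 0], [1, 1], [0, 1]]], dtype=np.float32)
polys_2 = np.array([[[0.5, 0], [1.5, 0], [1.5, 1], [0.5, 1]]], dtype=np.float32)  # shifted by 0.5 on x

iou = polygon_iou(polys_1, polys_2)
# intersection area 0.5, union 1 + 1 - 0.5 = 1.5 -> IoU = 1/3 (approximately 0.33)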
 
 
-def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Convert boxes to masks
-
-    Args:
-        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
-        shape: spatial shapes of the output masks
-
-    Returns:
-        the boolean masks of shape (N, H, W)
-    """
-
-    masks = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
-
-    if boxes.shape[0] > 0:
-        # Get absolute coordinates
-        if boxes.dtype != np.int:
-            abs_boxes = boxes.copy()
-            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
-            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
-            abs_boxes = abs_boxes.round().astype(np.int)
-        else:
-            abs_boxes = boxes
-            abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
-
-        # TODO: optimize slicing to improve vectorization
-        for idx, _box in enumerate(abs_boxes):
-            box = rbbox_to_polygon(_box)
-            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
-
-    return masks.astype(bool)
-
-
-def nms(boxes: np.ndarray, thresh: float = .5) -> List[int]:
+def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
     """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
 
     Args:
+    ----
         boxes: np array of straight boxes: (*, 5), (xmin, ymin, xmax, ymax, score)
         thresh: iou threshold to perform box suppression.
 
     Returns:
+    -------
         A list of box indexes to keep
     """
     x1 = boxes[:, 0]
@@ -551,66 +535,71 @@ 

Source code for doctr.utils.metrics

 
 
 
-[docs] +[docs] class LocalizationConfusion: - """Implements common confusion metrics and mean IoU for localization evaluation. + r"""Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ - Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) + \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ + Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ + Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ + meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \\forall y \\in \\mathcal{B}, - g_X(y) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, + \forall y \in \mathcal{B}, + g_X(y) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: + """Updates the metric + Args: + ---- + gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + """ if preds.shape[0] > 0: # Compute IoU - if self.rotated_bbox: - mask_gts = rbox_to_mask(gts, shape=self.mask_shape) - mask_preds = rbox_to_mask(preds, shape=self.mask_shape) - iou_mat = mask_iou(mask_gts, mask_preds) + if self.use_polygons: + iou_mat = polygon_iou(gts, preds) else: iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=1).sum()) + self.tot_iou += float(iou_mat.max(axis=0).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -618,17 +607,18 @@

Source code for doctr.utils.metrics

 
         # Update counts
         self.num_gts += gts.shape[0]
-        self.num_preds += preds.shape[0]
+        self.num_preds += preds.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: """Computes the aggregated metrics - Returns: + Returns + ------- a tuple with the recall, precision and meanIoU scores """ - # Recall recall = self.matches / self.num_gts if self.num_gts > 0 else None @@ -636,7 +626,7 @@

Source code for doctr.utils.metrics

         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -645,64 +635,65 @@

Source code for doctr.utils.metrics

         self.num_gts = 0
         self.num_preds = 0
         self.matches = 0
-        self.tot_iou = 0.
+ self.tot_iou = 0.0
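Tying the definitions above together, a small worked run (a sketch of the expected outputs):

import numpy as np
from doctr.utils.metrics import LocalizationConfusion

metric = LocalizationConfusion(iou_thresh=0.5)
gts = np.array([[0, 0, 100, 100]], dtype=np.float32)
preds = np.array([[0, 0, 100, 100], [110, 95, 200, 150]], dtype=np.float32)
metric.update(gts, preds)

recall, precision, mean_iou = metric.summary()
# the single ground truth is matched by the first prediction (IoU = 1.0) -> recall = 1.0
# one of the two predictions is matched -> precision = 0.5
# mean IoU over predictions: (1.0 + 0.0) / 2 = 0.5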
-[docs] +[docs] class OCRMetric: - """Implements end-to-end OCR metric. + r"""Implements an end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, - \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ - Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) + \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, + \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ + Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, - h_{B,L}(b, l) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ - & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, - :math:`\\mathcal{L}` is the set of possible character sequences, + \forall (b, l) \in \mathcal{B} \times \mathcal{L}, + h_{B,L}(b, l) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - ['hello'], ['hello', 'world']) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> ['hello'], ['hello', 'world']) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update( self, gt_boxes: np.ndarray, @@ -710,50 +701,58 @@

Source code for doctr.utils.metrics

         gt_labels: List[str],
         pred_labels: List[str],
     ) -> None:
+        """Updates the metric
 
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+            gt_labels: a list of N string labels
+            pred_labels: a list of M string labels
+        """
         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
-            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
-                                 "and the predictions")
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
 
         # Compute IoU
         if pred_boxes.shape[0] > 0:
-            if self.rotated_bbox:
-                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
-                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
-                iou_mat = mask_iou(mask_gts, mask_preds)
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
-            self.tot_iou += float(iou_mat.max(axis=1).sum())
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
 
             # Assign pairs
             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.unidecode_matches += int(_unidecode)
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
 
         self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]: """Computes the aggregated metrics - Returns: - a tuple with the recall & precision for each string comparison flexibility and the mean IoU + Returns + ------- + a tuple with the recall & precision for each string comparison and the mean IoU """ - # Recall recall = dict( raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None, caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None, - unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None, + anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None, unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None, ) @@ -761,12 +760,12 @@

Source code for doctr.utils.metrics

         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-            unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
             unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -774,12 +773,136 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.num_gts = 0
         self.num_preds = 0
-        self.tot_iou = 0.
+        self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.unidecode_matches = 0
+        self.anyascii_matches = 0
         self.unicase_matches = 0
+ + +
+[docs] +class DetectionMetric: + r"""Implements an object detection metric. + + The aggregated metrics are computed as follows: + + .. math:: + \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, + \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ + Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + + with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and + :math:`y`, and the function :math:`h_{B, C}` defined as: + + .. math:: + \forall (b, c) \in \mathcal{B} \times \mathcal{C}, + h_{B,C}(b, c) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{C}` is the set of possible class indices, + :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. + + >>> import numpy as np + >>> from doctr.utils import DetectionMetric + >>> metric = DetectionMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) + >>> metric.summary() + + Args: + ---- + iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format + """ + + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: + self.iou_thresh = iou_thresh + self.use_polygons = use_polygons + self.reset() + +
+[docs] + def update( + self, + gt_boxes: np.ndarray, + pred_boxes: np.ndarray, + gt_labels: np.ndarray, + pred_labels: np.ndarray, + ) -> None: + """Updates the metric + + Args: + ---- + gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + gt_labels: an array of class indices of shape (N,) + pred_labels: an array of class indices of shape (M,) + """ + if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: + raise AssertionError( + "there should be the same number of boxes and string both for the ground truth and the predictions" + ) + + # Compute IoU + if pred_boxes.shape[0] > 0: + if self.use_polygons: + iou_mat = polygon_iou(gt_boxes, pred_boxes) + else: + iou_mat = box_iou(gt_boxes, pred_boxes) + + self.tot_iou += float(iou_mat.max(axis=0).sum()) + + # Assign pairs + gt_indices, pred_indices = linear_sum_assignment(-iou_mat) + is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh + # Category comparison + self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) + + self.num_gts += gt_boxes.shape[0] + self.num_preds += pred_boxes.shape[0]
+ + +
+[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Computes the aggregated metrics + + Returns + ------- + a tuple with the recall & precision for each class prediction and the mean IoU + """ + # Recall + recall = self.num_matches / self.num_gts if self.num_gts > 0 else None + + # Precision + precision = self.num_matches / self.num_preds if self.num_preds > 0 else None + + # mean IoU (overall detected boxes) + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None + + return recall, precision, mean_iou
+ + + def reset(self) -> None: + self.num_gts = 0 + self.num_preds = 0 + self.tot_iou = 0.0 + self.num_matches = 0
+
@@ -812,8 +935,8 @@

Source code for doctr.utils.metrics

       
     
   
- - + + diff --git a/v0.5.0/_modules/doctr/utils/visualization.html b/v0.5.0/_modules/doctr/utils/visualization.html index 8e7dcca811..c818be6d7b 100644 --- a/v0.5.0/_modules/doctr/utils/visualization.html +++ b/v0.5.0/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.visualization

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import colorsys
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import matplotlib.pyplot as plt
-from matplotlib.figure import Figure
+import cv2
 import matplotlib.patches as patches
-import mplcursors
-from PIL import ImageFont, ImageDraw, Image
+import matplotlib.pyplot as plt
 import numpy as np
-import cv2
-from typing import Tuple, List, Dict, Any, Union
+from matplotlib.figure import Figure
 
-from .common_types import BoundingBox, RotatedBbox
+from .common_types import BoundingBox, Polygon4P
 
-__all__ = ['visualize_page', 'synthetize_page']
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
-def create_rect_patch(
-    geometry: Union[BoundingBox, RotatedBbox],
-    label: str,
+def rect_patch(
+    geometry: BoundingBox,
     page_dimensions: Tuple[int, int],
-    color: Tuple[int, int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
     alpha: float = 0.3,
     linewidth: int = 2,
     fill: bool = True,
-) -> patches.Patch:
-    """Create a matplotlib patch (rectangle) bounding the element
+    preserve_aspect_ratio: bool = False,
+) -> patches.Rectangle:
+    """Create a matplotlib rectangular patch for the element
 
     Args:
+    ----
         geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
         label: label to display when hovered
-        page_dimensions: dimensions of the Page
         color: color to draw box
         alpha: opacity parameter to fill the boxes, 0 = transparent
         linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
 
     Returns:
+    -------
         a rectangular Patch
     """
+    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
     height, width = page_dimensions
-    if len(geometry) == 5:
-        x, y, w, h, a = geometry  # type: ignore[misc]
-        x, w = x * width, w * width
-        y, h = y * height, h * height
-        points = cv2.boxPoints(((x, y), (w, h), a))
-        return patches.Polygon(
-            points,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
-    else:
-        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
-        xmin, xmax = xmin * width, xmax * width
-        ymin, ymax = ymin * height, ymax * height
-        return patches.Rectangle(
-            (xmin, ymin),
-            xmax - xmin,
-            ymax - ymin,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
+    (xmin, ymin), (xmax, ymax) = geometry
+    # Switch to absolute coords
+    if preserve_aspect_ratio:
+        width = height = max(height, width)
+    xmin, w = xmin * width, (xmax - xmin) * width
+    ymin, h = ymin * height, (ymax - ymin) * height
+
+    return patches.Rectangle(
+        (xmin, ymin),
+        w,
+        h,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def polygon_patch(
+    geometry: np.ndarray,
+    page_dimensions: Tuple[int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    alpha: float = 0.3,
+    linewidth: int = 2,
+    fill: bool = True,
+    preserve_aspect_ratio: bool = False,
+) -> patches.Polygon:
+    """Create a matplotlib polygon patch for the element
+
+    Args:
+    ----
+        geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
+        label: label to display when hovered
+        color: color to draw box
+        alpha: opacity parameter to fill the boxes, 0 = transparent
+        linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
+
+    Returns:
+    -------
+        a polygon Patch
+    """
+    if not geometry.shape == (4, 2):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
+    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
+    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
+
+    return patches.Polygon(
+        geometry,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def create_obj_patch(
+    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
+    page_dimensions: Tuple[int, int],
+    **kwargs: Any,
+) -> patches.Patch:
+    """Create a matplotlib patch for the element
+
+    Args:
+    ----
+        geometry: bounding box (straight or rotated) of the element
+        page_dimensions: dimensions of the page in format (height, width)
+        **kwargs: keyword arguments for the patch
+
+    Returns:
+    -------
+        a matplotlib Patch
+    """
+    if isinstance(geometry, tuple):
+        if len(geometry) == 2:  # straight word BB (2 pts)
+            return rect_patch(geometry, page_dimensions, **kwargs)
+        elif len(geometry) == 4:  # rotated word BB (4 pts)
+            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
+    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
+        return polygon_patch(geometry, page_dimensions, **kwargs)
+    raise ValueError("invalid geometry format")
+
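For reference, a small sketch of the dispatch above (straight boxes yield a Rectangle patch, 4-point geometries yield a Polygon patch; keyword arguments are forwarded to the underlying patch builders):

import numpy as np
from doctr.utils.visualization import create_obj_patch

page_dims = (1024, 768)  # (height, width)

# straight box: ((xmin, ymin), (xmax, ymax)) in relative coordinates -> patches.Rectangle
rect = create_obj_patch(((0.1, 0.1), (0.4, 0.5)), page_dims, color=(0, 0, 1))

# rotated box: (4, 2) array of relative corner points -> patches.Polygon
poly = create_obj_patch(
    np.array([[0.1, 0.1], [0.4, 0.1], [0.4, 0.5], [0.1, 0.5]]), page_dims, color=(1, 0, 0)
)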
+
+def get_colors(num_colors: int) -> List[Tuple[float, float, float]]:
+    """Generate num_colors color for matplotlib
+
+    Args:
+    ----
+        num_colors: number of colors to generate
+
+    Returns:
+    -------
+        colors: list of generated colors
+    """
+    colors = []
+    for i in np.arange(0.0, 360.0, 360.0 / num_colors):
+        hue = i / 360.0
+        lightness = (50 + np.random.rand() * 10) / 100.0
+        saturation = (90 + np.random.rand() * 10) / 100.0
+        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+    return colors
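A quick sketch of the helper above, e.g. to get one distinct color per KIE class:

from doctr.utils.visualization import get_colors

colors = get_colors(3)
# three (r, g, b) tuples in [0, 1], with hues spread evenly (120° apart) and slightly jittered lightness/saturation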
 
 
 
-[docs] +[docs] def visualize_page( page: Dict[str, Any], image: np.ndarray, @@ -359,18 +472,18 @@

Source code for doctr.utils.visualization

 ) -> Figure:
     """Visualize a full page with predicted blocks, lines and words
 
-    Example::
-        >>> import numpy as np
-        >>> import matplotlib.pyplot as plt
-        >>> from doctr.utils.visualization import visualize_page
-        >>> from doctr.models import ocr_db_crnn
-        >>> model = ocr_db_crnn(pretrained=True)
-        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-        >>> out = model([[input_page]])
-        >>> visualize_page(out[0].pages[0].export(), input_page)
-        >>> plt.show()
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from doctr.utils.visualization import visualize_page
+    >>> from doctr.models import ocr_db_crnn
+    >>> model = ocr_db_crnn(pretrained=True)
+    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+    >>> out = model([[input_page]])
+    >>> visualize_page(out[0].pages[0].export(), input_page)
+    >>> plt.show()
 
     Args:
+    ----
         page: the exported Page of a Document
         image: np array of the page, needs to have the same shape as page['dimensions']
         words_only: whether only words should be displayed
@@ -378,6 +491,11 @@ 

Source code for doctr.utils.visualization

         scale: figsize of the largest windows side
         interactive: whether the plot should be interactive
         add_labels: for static plot, adds text labels on top of bounding box
+        **kwargs: keyword arguments for the polygon patch
+
+    Returns:
+    -------
+        the matplotlib figure
     """
     # Get proper scale and aspect ratio
     h, w = image.shape[:2]
@@ -386,128 +504,189 @@ 

Source code for doctr.utils.visualization

     # Display the image
     ax.imshow(image)
     # hide both axis
-    ax.axis('off')
+    ax.axis("off")
 
     if interactive:
         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
 
-    for block in page['blocks']:
+    for block in page["blocks"]:
         if not words_only:
-            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
+            rect = create_obj_patch(
+                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
+            )
             # add patch on figure
             ax.add_patch(rect)
             if interactive:
                 # add patch to cursor's artists
                 artists.append(rect)
 
-        for line in block['lines']:
+        for line in block["lines"]:
             if not words_only:
-                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
+                rect = create_obj_patch(
+                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
-            for word in line['words']:
-                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
-                                         page['dimensions'], (0, 0, 1), **kwargs)
+            for word in line["words"]:
+                rect = create_obj_patch(
+                    word["geometry"],
+                    page["dimensions"],
+                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
+                    color=(0, 0, 1),
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
                 elif add_labels:
-                    if len(word['geometry']) == 5:
+                    if len(word["geometry"]) == 5:
                         text_loc = (
-                            int(page['dimensions'][1] * (word['geometry'][0] - word['geometry'][2] / 2)),
-                            int(page['dimensions'][0] * (word['geometry'][1] - word['geometry'][3] / 2))
+                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
+                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
                         )
                     else:
                         text_loc = (
-                            int(page['dimensions'][1] * word['geometry'][0][0]),
-                            int(page['dimensions'][0] * word['geometry'][0][1])
+                            int(page["dimensions"][1] * word["geometry"][0][0]),
+                            int(page["dimensions"][0] * word["geometry"][0][1]),
+                        )
+
+                    if len(word["geometry"]) == 2:
+                        # We draw only if boxes are in straight format
+                        ax.text(
+                            *text_loc,
+                            word["value"],
+                            size=10,
+                            alpha=0.5,
+                            color=(0, 0, 1),
                         )
-                    ax.text(
-                        *text_loc,
-                        word['value'],
-                        size=10,
-                        alpha=0.5,
-                        color=(0, 0, 1),
-                    )
 
         if display_artefacts:
-            for artefact in block['artefacts']:
-                rect = create_rect_patch(
-                    artefact['geometry'],
-                    'artefact',
-                    page['dimensions'],
-                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
+            for artefact in block["artefacts"]:
+                rect = create_obj_patch(
+                    artefact["geometry"],
+                    page["dimensions"],
+                    label="artefact",
+                    color=(0.5, 0.5, 0.5),
                     linewidth=1,
-                    **kwargs
+                    **kwargs,
                 )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
-    fig.tight_layout(pad=0.)
+    fig.tight_layout(pad=0.0)
 
     return fig
-def synthetize_page( +def visualize_kie_page( page: Dict[str, Any], - draw_proba: bool = False, - font_size: int = 13, -) -> np.ndarray: - """Draw a the content of the element page (OCR response) on a blank page. + image: np.ndarray, + words_only: bool = False, + display_artefacts: bool = True, + scale: float = 10, + interactive: bool = True, + add_labels: bool = True, + **kwargs: Any, +) -> Figure: + """Visualize a full page with predicted blocks, lines and words + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from doctr.utils.visualization import visualize_page + >>> from doctr.models import ocr_db_crnn + >>> model = ocr_db_crnn(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([[input_page]]) + >>> visualize_kie_page(out[0].pages[0].export(), input_page) + >>> plt.show() Args: - page: exported Page object to represent - draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0 - font_size: size of the font, default font = 13 + ---- + page: the exported Page of a Document + image: np array of the page, needs to have the same shape than page['dimensions'] + words_only: whether only words should be displayed + display_artefacts: whether artefacts should be displayed + scale: figsize of the largest windows side + interactive: whether the plot should be interactive + add_labels: for static plot, adds text labels on top of bounding box + **kwargs: keyword arguments for the polygon patch - Return: - A np array (drawn page) + Returns: + ------- + the matplotlib figure """ - # Draw template - h, w = page["dimensions"] - response = 255 * np.ones((h, w, 3), dtype=np.int32) + # Get proper scale and aspect ratio + h, w = image.shape[:2] + size = (scale * w / h, scale) if h > w else (scale, h / w * scale) + fig, ax = plt.subplots(figsize=size) + # Display the image + ax.imshow(image) + # hide both axis + ax.axis("off") - # Draw each word - for block in page["blocks"]: - for line in block["lines"]: - for word in line["words"]: - # Get aboslute word geometry - (xmin, ymin), (xmax, ymax) = word["geometry"] - xmin, xmax = int(w * xmin), int(w * xmax) - ymin, ymax = int(h * ymin), int(h * ymax) - - # White drawing context adapted to font size, 0.75 factor to convert pts --> pix - h_box, w_box = ymax - ymin, xmax - xmin - h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75)) - img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255)) - d = ImageDraw.Draw(img) - - # Draw in black the value of the word - d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0)) - - # Resize back to box size - img = img.resize((w_box, h_box), Image.NEAREST) - - # Colorize if draw_proba - if draw_proba: - p = int(255 * word["confidence"]) - mask = np.where(np.array(img) == 0, 1, 0) - proba = np.array([255 - p, 0, p]) - color = mask * proba[np.newaxis, np.newaxis, :] - white_mask = 255 * (1 - mask) - img = color + white_mask - - # Write to response page - response[ymin:ymax, xmin:xmax, :] = np.array(img) - - return response + if interactive: + artists: List[patches.Patch] = [] # instantiate an empty list of patches (to be drawn on the page) + + colors = {k: color for color, k in zip(get_colors(len(page["predictions"])), page["predictions"])} + for key, value in page["predictions"].items(): + for prediction in value: + if not words_only: + rect = create_obj_patch( + prediction["geometry"], + page["dimensions"], + label=f"{key} \n {prediction['value']} (confidence: 
{prediction['confidence']:.2%}", + color=colors[key], + linewidth=1, + **kwargs, + ) + # add patch on figure + ax.add_patch(rect) + if interactive: + # add patch to cursor's artists + artists.append(rect) + + if interactive: + import mplcursors + + # Create mlp Cursor to hover patches in artists + mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label())) + fig.tight_layout(pad=0.0) + + return fig + + +def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None: + """Draw an array of relative straight boxes on an image + + Args: + ---- + boxes: array of relative boxes, of shape (*, 4) + image: np array, float32 or uint8 + color: color to use for bounding box edges + **kwargs: keyword arguments from `matplotlib.pyplot.plot` + """ + h, w = image.shape[:2] + # Convert boxes to absolute coords + _boxes = deepcopy(boxes) + _boxes[:, [0, 2]] *= w + _boxes[:, [1, 3]] *= h + _boxes = _boxes.astype(np.int32) + for box in _boxes.tolist(): + xmin, ymin, xmax, ymax = box + image = cv2.rectangle( + image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2 + ) + plt.imshow(image) + plt.plot(**kwargs)
@@ -540,8 +719,8 @@

Source code for doctr.utils.visualization

       
     
   
- - + + diff --git a/v0.5.0/_modules/index.html b/v0.5.0/_modules/index.html index e86abcd4d4..5793c44f20 100644 --- a/v0.5.0/_modules/index.html +++ b/v0.5.0/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -225,20 +225,42 @@ - - + + diff --git a/v0.5.0/_sources/changelog.rst.txt b/v0.5.0/_sources/changelog.rst.txt index 430097d6c8..35befe7b96 100644 --- a/v0.5.0/_sources/changelog.rst.txt +++ b/v0.5.0/_sources/changelog.rst.txt @@ -1,6 +1,54 @@ Changelog ========= +v0.10.0 (2024-10-21) +------------------- +Release note: `v0.10.0 `_ + +v0.9.0 (2024-08-08) +------------------- +Release note: `v0.9.0 `_ + +v0.8.1 (2024-03-04) +------------------- +Release note: `v0.8.1 `_ + +v0.8.0 (2024-02-28) +------------------- +Release note: `v0.8.0 `_ + +v0.7.0 (2023-09-09) +------------------- +Release note: `v0.7.0 `_ + +v0.6.0 (2022-09-29) +------------------- +Release note: `v0.6.0 `_ + +v0.5.1 (2022-03-22) +------------------- +Release note: `v0.5.1 `_ + +v0.5.0 (2021-12-31) +------------------- +Release note: `v0.5.0 `_ + +v0.4.1 (2021-11-22) +------------------- +Release note: `v0.4.1 `_ + +v0.4.0 (2021-10-01) +------------------- +Release note: `v0.4.0 `_ + +v0.3.1 (2021-08-27) +------------------- +Release note: `v0.3.1 `_ + +v0.3.0 (2021-07-02) +------------------- +Release note: `v0.3.0 `_ + v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.5.0/_sources/datasets.rst.txt b/v0.5.0/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.5.0/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.5.0/_sources/documents.rst.txt b/v0.5.0/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.5.0/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. 
currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.5.0/_sources/getting_started/installing.rst.txt b/v0.5.0/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.5.0/_sources/getting_started/installing.rst.txt +++ b/v0.5.0/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.5.0/_sources/index.rst.txt b/v0.5.0/_sources/index.rst.txt index fc3ff89fdf..53251db142 100644 --- a/v0.5.0/_sources/index.rst.txt +++ b/v0.5.0/_sources/index.rst.txt @@ -1,7 +1,8 @@ -DocTR: Document Text Recognition -================================ +******************************** +docTR: Document Text Recognition +******************************** -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -9,38 +10,29 @@ State-of-the-art Optical Character Recognition made seamless & accessible to any DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. +* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. -Welcome to the documentation of `DocTR `_! 
- - Main Features ------------- * |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract +* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, small dependencies -* |:tools:| Daily maintained -* |:factory:| Easy integration - +* |:bird:| Light package, minimal dependencies +* |:tools:| Actively maintained by Mindee +* |:factory:| Easy integration (available templates for browser demo & API deployment) -Getting Started ---------------- .. toctree:: :maxdepth: 2 + :caption: Getting started + :hidden: - installing - - -Build & train your predictor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) -* Fine-tune or train from scratch any detection or recognition model to specialize on your data + getting_started/installing + notebooks Model zoo @@ -48,36 +40,83 @@ Model zoo Text detection models """"""""""""""""""""" - * `DBNet `_ (Differentiable Binarization) - * `LinkNet `_ +* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ +* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ +* FAST from `"FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation" `_ Text recognition models """"""""""""""""""""""" - * `SAR `_ (Show, Attend and Read) - * `CRNN `_ (Convolutional Recurrent Neural Network) - * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) +* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ +* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ +* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ +* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ +* PARSeq from `"Scene Text Recognition with Permuted Autoregressive Sequence Models" `_ Supported datasets ^^^^^^^^^^^^^^^^^^ - * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. - * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. - * SROIE from `ICDAR 2019 `_. +* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. +* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +* SROIE from `ICDAR 2019 `_. +* IIIT-5k from `CVIT `_. +* Street View Text from `"End-to-End Scene Text Recognition" `_. +* SynthText from `Visual Geometry Group `_. +* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. +* IC03 from `ICDAR 2003 `_. +* IC13 from `ICDAR 2013 `_. +* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. +* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. +* IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. 
toctree:: :maxdepth: 2 - :caption: Notes + :caption: Using docTR + :hidden: - changelog + using_doctr/using_models + using_doctr/using_datasets + using_doctr/using_contrib_modules + using_doctr/sharing_models + using_doctr/using_model_export + using_doctr/custom_models_training + using_doctr/running_on_aws + + +.. toctree:: + :maxdepth: 2 + :caption: Community + :hidden: + + community/resources .. toctree:: :maxdepth: 2 :caption: Package Reference + :hidden: - datasets - documents - models - transforms - utils + modules/contrib + modules/datasets + modules/io + modules/models + modules/transforms + modules/utils + + +.. toctree:: + :maxdepth: 2 + :caption: Contributing + :hidden: + + contributing/code_of_conduct + contributing/contributing + + +.. toctree:: + :maxdepth: 2 + :caption: Notes + :hidden: + + changelog diff --git a/v0.5.0/_sources/installing.rst.txt b/v0.5.0/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.5.0/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.5.0/_sources/models.rst.txt b/v0.5.0/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.5.0/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. 
- - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. 
-We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.5.0/_sources/transforms.rst.txt b/v0.5.0/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.5.0/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. 
autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.5.0/_sources/utils.rst.txt b/v0.5.0/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.5.0/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.5.0/_static/basic.css b/v0.5.0/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.5.0/_static/basic.css +++ b/v0.5.0/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.5.0/_static/doctools.js b/v0.5.0/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.5.0/_static/doctools.js +++ b/v0.5.0/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.5.0/_static/documentation_options.js b/v0.5.0/_static/documentation_options.js index a7b5cbe04a..4f656fdbea 100644 --- a/v0.5.0/_static/documentation_options.js +++ b/v0.5.0/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.3.0a0-git', + VERSION: '0.10.1a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.5.0/_static/language_data.js b/v0.5.0/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.5.0/_static/language_data.js +++ b/v0.5.0/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.5.0/_static/searchtools.js b/v0.5.0/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.5.0/_static/searchtools.js +++ b/v0.5.0/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.5.0/changelog.html b/v0.5.0/changelog.html index eafac3a877..fc45a50384 100644 --- a/v0.5.0/changelog.html +++ b/v0.5.0/changelog.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + Changelog - docTR documentation @@ -226,20 +226,42 @@ + diff --git a/v0.5.0/community/resources.html b/v0.5.0/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.5.0/community/resources.html +++ b/v0.5.0/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.5.0/contributing/code_of_conduct.html b/v0.5.0/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.5.0/contributing/code_of_conduct.html +++ b/v0.5.0/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.5.0/contributing/contributing.html b/v0.5.0/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.5.0/contributing/contributing.html +++ b/v0.5.0/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.5.0/datasets.html b/v0.5.0/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.5.0/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework -can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-

Data Loading

-

Each dataset has its own way of loading a sample, but batch aggregation and the underlying iterator are handled by another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before being passed to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets -of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name          | size | characters
digits        | 10   | 0123456789
ascii_letters | 52   | abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
punctuation   | 32   | !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
currency      | 5    | £€¥¢฿
latin         | 96   | 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°
french        | 154  | 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
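A minimal usage sketch (not part of the original reference; it assumes VOCABS is exposed by doctr.datasets in this version, and the target_size of 32 is only illustrative):

>>> from doctr.datasets import VOCABS, encode_sequences
>>> labels = ["hello", "world"]
>>> # shorter sequences are padded up to target_size with the EOS value
>>> encoded = encode_sequences(sequences=labels, vocab=VOCABS["french"], target_size=32)
>>> print(encoded.shape)  # (2, 32)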
-
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/documents.html b/v0.5.0/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.5.0/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

  • -
  • size (the page's)

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, words on the same horizontal level but in different columns belong to two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and the confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
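As a hedged illustration of the structure above (assuming a Document instance, e.g. the output of an OCR predictor, is available as result):

>>> for page in result.pages:
...     for block in page.blocks:
...         for line in block.lines:
...             print(" ".join(word.value for word in line.words))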
-

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert its pages into images in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/genindex.html b/v0.5.0/genindex.html index a19b433943..21520455b4 100644 --- a/v0.5.0/genindex.html +++ b/v0.5.0/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -224,20 +224,42 @@

+
+

U

+ + +
+
+

V

@@ -561,7 +711,13 @@

V

W

+
@@ -599,8 +755,8 @@

W

- - + + diff --git a/v0.5.0/getting_started/installing.html b/v0.5.0/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.5.0/getting_started/installing.html +++ b/v0.5.0/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.5.0/index.html b/v0.5.0/index.html index 4c6a28c66a..3a06afc6d9 100644 --- a/v0.5.0/index.html +++ b/v0.5.0/index.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + docTR documentation @@ -226,20 +226,42 @@
-

DocTR: Document Text Recognition

-

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta)

+

docTR: Document Text Recognition

+

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

DocTR provides an easy and powerful way to extract valuable information from your documents:

    -
  • 🧾 for automation: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • +
  • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • 👩‍🔬 for research: quickly compare your own architectures speed & performances with state-of-art models on public datasets.

-

Welcome to the documentation of DocTR!

Main Features

  • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

  • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor

  • -
  • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

  • +
  • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

  • ⚡ Optimized for inference speed on both CPU & GPU

  • -
  • 🐦 Light package, small dependencies

  • -
  • 🛠️ Daily maintained

  • -
  • 🏭 Easy integration

  • +
  • 🐦 Light package, minimal dependencies

  • +
  • 🛠️ Actively maintained by Mindee

  • +
  • 🏭 Easy integration (available templates for browser demo & API deployment)

-
-
-

Getting Started

-
-

Build & train your predictor

-
    -
  • Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

  • -
  • Fine-tune or train from scratch any detection or recognition model to specialize on your data

  • -
-

Model zoo

Text detection models

-
-

Text recognition models

-
-

Supported datasets

-
-
+
+
+
+
+
@@ -406,7 +381,7 @@

Supported datasets - +
Next @@ -446,10 +421,8 @@

Supported datasets + diff --git a/v0.5.0/installing.html b/v0.5.0/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.5.0/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running an OS other than Linux, you will need a few extra dependencies.

-

For macOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the latest stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/models.html b/v0.5.0/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.5.0/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with a TensorFlow backend, along with its specific post-processor to make outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Architecture | Input shape     | # params | FUNSD Recall | FUNSD Precision | CORD Recall | CORD Precision | FPS
db_resnet50  | (1024, 1024, 3) | 25.2 M   | 82.14        | 87.64           | 92.49       | 89.66          | 2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

-

FPS (frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up, then measure its average speed over 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to run the experiments.
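A hedged sketch of this timing protocol (the actual benchmark script is not part of these docs):

>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> dummy = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> for _ in range(100):  # warm-up with 100 random tensors
...     _ = model(dummy, training=False)
>>> start = time.time()
>>> for _ in range(1000):  # 1000 batches of 1 frame
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32), training=False)
>>> print(1000 / (time.time() - start))  # frames per second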

-
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following (a rough sketch with TensorFlow is given after the list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
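A rough sketch of these three steps, assuming images is a list of H x W x 3 uint8 numpy arrays; the mean/std values below are placeholders, not the library's actual training statistics:

>>> import tensorflow as tf
>>> # 1. resize (bilinear by default), possibly deforming the image
>>> resized = [tf.image.resize(img, (1024, 1024), method="bilinear") for img in images]
>>> # 2. batch images together
>>> batch = tf.stack(resized, axis=0) / 255.0
>>> # 3. normalize with the training data statistics (placeholder values here)
>>> mean, std = tf.constant([0.8, 0.78, 0.77]), tf.constant([0.26, 0.27, 0.29])
>>> batch = (batch - mean) / std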
-
-
-

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Text recognition model zoo

Architecture  | Input shape  | # params | FUNSD | CORD | FPS
crnn_vgg16_bn | (32, 128, 3) | 15.8M    | 86.02 | 91.3 | 12.8
sar_vgg16_bn  | (32, 128, 3) | 21.5M    | 86.2  | 91.7 | 3.3
sar_resnet31  | (32, 128, 3) | 53.1M    | 86.3  | 92.1 | 2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model capabilities

-

FPS (frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up, then measure its average speed over 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to run the experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
-
-

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) → CRNN

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition”.

Example::
  >>> import tensorflow as tf
  >>> from doctr.models import crnn_vgg16_bn
  >>> model = crnn_vgg16_bn(pretrained=True)
  >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
  >>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:
  text recognition architecture

doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) → SAR

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example::
  >>> import tensorflow as tf
  >>> from doctr.models import sar_vgg16_bn
  >>> model = sar_vgg16_bn(pretrained=False)
  >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
  >>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:
  text recognition architecture

doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) → SAR

SAR with a ResNet-31 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example::
  >>> import tensorflow as tf
  >>> from doctr.models import sar_resnet31
  >>> model = sar_resnet31(pretrained=False)
  >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
  >>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:
  text recognition architecture

doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) → MASTER

MASTER as described in the paper <https://arxiv.org/pdf/1910.02562.pdf>.

Example::
  >>> import tensorflow as tf
  >>> from doctr.models import master
  >>> model = master(pretrained=False)
  >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
  >>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:
  text recognition architecture

Recognition predictors


Combining the right components around a given architecture for easier usage.

doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → RecognitionPredictor

Text recognition architecture.

Example::
  >>> import numpy as np
  >>> from doctr.models import recognition_predictor
  >>> model = recognition_predictor(pretrained=True)
  >>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
  >>> out = model([input_page])

Parameters:
  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)
  • pretrained – If True, returns a model pre-trained on our text recognition dataset

Returns:
  Recognition predictor

End-to-End OCR


Predictors that localize and identify text elements in images

                                  FUNSD                        CORD
Architecture                      Recall   Precision   FPS     Recall   Precision   FPS
db_resnet50 + crnn_vgg16_bn       70.08    74.77       0.85    82.19    79.67       1.6
db_resnet50 + sar_vgg16_bn        N/A      N/A         0.49    N/A      N/A         1.0
db_resnet50 + sar_resnet31        N/A      N/A         0.27    N/A      N/A         0.83
Gvision text detection            59.50    62.50               75.30    70.00
Gvision doc. text detection       64.00    53.30               68.90    61.10
AWS textract                      78.10    83.00               87.50    66.00

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

All recognition models of predictors are trained with our French vocab (cf. Supported Vocabs).

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities.

FPS (Frames per second) is computed this way: we instantiate the predictor, we warm up the model, and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

Results on private OCR datasets

                                     Receipts               Invoices               IDs
Architecture                         Recall   Precision    Recall   Precision    Recall   Precision
db_resnet50 + crnn_vgg16_bn (ours)   78.90    81.01        65.68    69.86        49.48    50.46
Gvision doc. text detection          68.91    59.89        63.20    52.85        43.70    29.21
AWS textract                         75.77    77.70        70.47    69.13        46.39    43.32

Two-stage approaches

Those architectures involve one stage of text detection and one stage of text recognition. The text detection stage produces cropped images that are then passed to the text recognition block; a conceptual sketch of this flow is shown below, followed by the end-to-end predictor that wraps it.
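
For illustration only, here is a minimal sketch of that flow using the two predictors introduced above. It assumes the detection predictor returns, for each page, an array of relative (xmin, ymin, xmax, ymax, confidence) boxes, which may differ across versions:

  >>> import numpy as np
  >>> from doctr.models import detection_predictor, recognition_predictor
  >>> det_model = detection_predictor(pretrained=True)
  >>> reco_model = recognition_predictor(pretrained=True)
  >>> page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
  >>> boxes = det_model([page])[0]  # assumed shape (N, 5): relative xmin, ymin, xmax, ymax, confidence
  >>> h, w = page.shape[:2]
  >>> crops = [page[int(ymin * h): int(ymax * h), int(xmin * w): int(xmax * w)]
  ...          for xmin, ymin, xmax, ymax, _ in boxes]
  >>> words = reco_model(crops)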

doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → OCRPredictor

End-to-end OCR architecture using one model for localization, and another for text recognition.

Example::
  >>> import numpy as np
  >>> from doctr.models import ocr_predictor
  >>> model = ocr_predictor(pretrained=True)
  >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
  >>> out = model([input_page])

Parameters:
  • det_arch – name of the detection architecture to use (‘db_resnet50’)
  • reco_arch – name of the recognition architecture to use (‘crnn_vgg16_bn’)
  • pretrained – If True, returns a model pre-trained on our OCR dataset

Returns:
  OCR predictor
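
The predictor returns a structured Document object (cf. Document structure). A short illustration of how its content can be inspected, assuming out comes from the example above:

  >>> json_output = out.export()  # nested dict covering pages, blocks, lines and words
  >>> out.show([input_page])      # overlay the predictions on the input page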

Model export


Utility functions to make the most of document analysis models.


Model compression

doctr.models.export.convert_to_tflite(tf_model: Model) → bytes

Converts a model to TFLite format.

Example::
  >>> from tensorflow.keras import Sequential
  >>> from doctr.models import convert_to_tflite, conv_sequence
  >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
  >>> serialized_model = convert_to_tflite(model)

Parameters:
  tf_model – a keras model

Returns:
  the serialized TFLite model

Return type:
  bytes

doctr.models.export.convert_to_fp16(tf_model: Model) → bytes

Converts a model to half precision.

Example::
  >>> from tensorflow.keras import Sequential
  >>> from doctr.models import convert_to_fp16, conv_sequence
  >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
  >>> serialized_model = convert_to_fp16(model)

Parameters:
  tf_model – a keras model

Returns:
  the serialized FP16 model

Return type:
  bytes

doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) → bytes

Quantize a TensorFlow model.

Example::
  >>> from tensorflow.keras import Sequential
  >>> from doctr.models import quantize_model, conv_sequence
  >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
  >>> serialized_model = quantize_model(model, (224, 224, 3))

Parameters:
  • tf_model – a keras model
  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

Returns:
  the serialized quantized model

Return type:
  bytes
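
These conversion utilities return the serialized model as raw bytes, which can be written to disk or, for a TFLite payload such as the output of convert_to_tflite or quantize_model above, loaded into a TFLite interpreter. A minimal sketch (the file name is arbitrary):

  >>> import tensorflow as tf
  >>> with open('model.tflite', 'wb') as f:
  ...     f.write(serialized_model)
  >>> interpreter = tf.lite.Interpreter(model_content=serialized_model)
  >>> interpreter.allocate_tensors()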

Using SavedModel

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

  >>> import tensorflow as tf
  >>> from doctr.models import db_resnet50
  >>> model = db_resnet50(pretrained=True)
  >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
  >>> _ = model(input_t, training=False)
  >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')

And loaded just as easily:

  >>> import tensorflow as tf
  >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], 
"channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], 
"line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation 
(class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", 
false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", 
"Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], 
"51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, 
"b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], 
"db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 
18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], 
"json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 
19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 
15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 
3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 
18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, 
"coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.5.0/transforms.html b/v0.5.0/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.5.0/transforms.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.
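As a minimal illustration of this composability (a sketch using the Resize and Normalize classes documented below, with arbitrary mean/std values), two modules can simply be called one after the other:

>>> from doctr.transforms import Normalize, Resize
>>> import tensorflow as tf
>>> resize = Resize((32, 32))
>>> normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
>>> img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
>>> out = normalize(resize(img))  # each transformation is a callable module, so they chain naturally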

-
-

Supported transformations

-

Here are all transformations that are available through docTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
- -
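For instance (an illustrative call with arbitrary shapes, not part of the original reference), combining the two padding-related flags resizes without distortion and splits the zero padding evenly on both sides:

>>> from doctr.transforms import Resize
>>> import tensorflow as tf
>>> transfo = Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
>>> out = transfo(tf.random.uniform(shape=[64, 32, 3], minval=0, maxval=1))  # content centered, zeros on both sides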
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor to a Gaussian distribution for each channel

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a user-defined function (lambda transformation) to a tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): -convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta -to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting -each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and -increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.

-
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomGamma, RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, one only will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Apply the input transformation with a probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.0/using_doctr/custom_models_training.html b/v0.5.0/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.5.0/using_doctr/custom_models_training.html +++ b/v0.5.0/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.5.0/using_doctr/running_on_aws.html b/v0.5.0/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.5.0/using_doctr/running_on_aws.html +++ b/v0.5.0/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.5.0/using_doctr/sharing_models.html b/v0.5.0/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.5.0/using_doctr/sharing_models.html +++ b/v0.5.0/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.5.0/using_doctr/using_contrib_modules.html b/v0.5.0/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.5.0/using_doctr/using_contrib_modules.html +++ b/v0.5.0/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.5.0/using_doctr/using_datasets.html b/v0.5.0/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.5.0/using_doctr/using_datasets.html +++ b/v0.5.0/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.5.0/using_doctr/using_model_export.html b/v0.5.0/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.5.0/using_doctr/using_model_export.html +++ b/v0.5.0/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.5.0/using_doctr/using_models.html b/v0.5.0/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.5.0/using_doctr/using_models.html +++ b/v0.5.0/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.5.0/utils.html b/v0.5.0/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.5.0/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.utils

-

This module gathers non-core features that complement the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest window side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model's performance.
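All the metrics below share the same two-step usage (a sketch distilled from the examples further down, not an additional API): accumulate ground truths and predictions with update(), then aggregate with summary():

>>> from doctr.utils import TextMatch
>>> metric = TextMatch()
>>> metric.update(['Hello'], ['hello'])   # can be called once per batch
>>> metric.update(['world'], ['world'])
>>> metric.summary()                      # aggregated over every update() call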

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements text match metric (word-level accuracy) for recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, -TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, -f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, -\(N\) is a strictly positive integer.
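As a hand-worked instance of this definition (numbers computed here for illustration, not extra library output): with X = ["Hello", "world"] and Y = ["hello", "world"], only the second pair matches exactly, so the raw score is 0.5 while its lower-case counterpart is 1.0:

>>> X, Y = ["Hello", "world"], ["hello", "world"]
>>> sum(x == y for x, y in zip(X, Y)) / len(X)                  # raw exact match: 0.5
>>> sum(x.lower() == y.lower() for x, y in zip(X, Y)) / len(X)  # lower-case counterpart: 1.0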

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode -counterpart and its lower-case unidecode counterpart

-
-
-
- -
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ -Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, -g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.
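For intuition, the IoU driving these scores can be computed by hand on the boxes used in the example below (a standalone sketch, independent of the doctr API):

>>> a, b = (0, 0, 100, 100), (0, 0, 70, 70)  # ground truth vs. first prediction, as (xmin, ymin, xmax, ymax)
>>> inter = max(0, min(a[2], b[2]) - max(a[0], b[0])) * max(0, min(a[3], b[3]) - max(a[1], b[1]))
>>> union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
>>> inter / union  # 0.49, just below the default iou_thresh of 0.5

The second prediction ([110, 95, 200, 150]) does not overlap the ground truth at all, so with an IoU threshold of 0.5 neither prediction is matched: recall and precision both come out as 0, and the mean IoU is (0.49 + 0) / 2 ≈ 0.245.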

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float | None, float | None, float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores

-
-
-
- -
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, -\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ -Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, -h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{L}\) is the set of possible character sequences, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.
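Continuing the hand-worked numbers above (an illustration, not output of the class): in the example below the prediction labelled 'hello' only reaches an IoU of 0.49 with the ground truth, so its transcription is never compared at all; recall and precision are therefore 0 even though the label is correct, and the mean IoU stays around 0.245.

>>> iou, pred_label, gt_label = 0.49, "hello", "hello"
>>> int(iou >= 0.5 and pred_label == gt_label)  # 0: the label is only checked once the box match succeeds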

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall and precision for each string comparison flexibility, along with the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.5.1/_modules/doctr/datasets/cord.html b/v0.5.1/_modules/doctr/datasets/cord.html index f98ee6901c..55b0584830 100644 --- a/v0.5.1/_modules/doctr/datasets/cord.html +++ b/v0.5.1/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.cord

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
-from doctr.utils.geometry import fit_rbbox
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['CORD']
+__all__ = ["CORD"]
 
 
 
-[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - Example:: - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 + :align: center + + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', - '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', - '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", + "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", + "cord_train.zip", + ) + + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", + "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", + "cord_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - - # # List images - self.root = os.path.join(self._root, 'image') - self.data: List[Tuple[str, Dict[str, Any]]] = [] + # List images + tmp_root = os.path.join(self.root, "image") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] self.train = train - self.sample_transforms = sample_transforms - for img_path in os.listdir(self.root): + np_dtype = np.float32 + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem _targets = [] - with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: if len(word["text"]) > 0: x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - if rotated_bbox: - box = list(fit_rbbox(np.array([ - [x[0], y[0]], - [x[1], y[1]], - [x[2], y[2]], - [x[3], y[3]], - ], dtype=np.float32))) + box: Union[List[float], np.ndarray] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], + dtype=np_dtype, + ) else: - # Reduce 8 coords to 4 + # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax box = [min(x), min(y), max(x), max(y)] - _targets.append((word['text'], box)) + _targets.append((word["text"], box)) text_targets, box_targets = zip(*_targets) - self.data.append(( - img_path, - dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets) - )) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
@@ -397,8 +461,8 @@

Source code for doctr.datasets.cord

       
     
   
-
- + + diff --git a/v0.5.1/_modules/doctr/datasets/core.html b/v0.5.1/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.5.1/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
- -
-
-
-
- - -
-
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/_modules/doctr/datasets/datasets/tensorflow.html b/v0.5.1/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.5.1/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset, _VisionDataset): - pass
- -
-
-
-
- - -
-
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/_modules/doctr/datasets/detection.html b/v0.5.1/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.5.1/_modules/doctr/datasets/detection.html +++ b/v0.5.1/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/doc_artefacts.html b/v0.5.1/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.5.1/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.5.1/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.5.1/_modules/doctr/datasets/funsd.html b/v0.5.1/_modules/doctr/datasets/funsd.html index 35d7ad4cf5..f08612f9fa 100644 --- a/v0.5.1/_modules/doctr/datasets/funsd.html +++ b/v0.5.1/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.funsd

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['FUNSD']
+__all__ = ["FUNSD"]
 
 
 
-[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - Example:: - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 + :align: center + + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' - SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' - FILE_NAME = 'funsd.zip' + URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" + SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" + FILE_NAME = "funsd.zip" def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + super().__init__( + self.URL, + self.FILE_NAME, + self.SHA256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - self.sample_transforms = sample_transforms + np_dtype = np.float32 # Use the subset - subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') + subfolder = os.path.join("dataset", "training_data" if train else "testing_data") # # List images - self.root = os.path.join(self._root, subfolder, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + tmp_root = os.path.join(self.root, subfolder, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: data = json.load(f) - _targets = [(word['text'], word['box']) for block in data['form'] - for word in block['words'] if len(word['text']) > 0] + _targets = [ + (word["text"], word["box"]) + for block in data["form"] + for word in block["words"] + if len(word["text"]) > 0 + ] text_targets, box_targets = zip(*_targets) - if rotated_bbox: - # box_targets: xmin, ymin, xmax, ymax -> x, y, w, h, alpha = 0 - box_targets = [ + if use_polygons: + # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + box_targets = [ # type: ignore[assignment] [ - (box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0 - ] for box in box_targets + [box[0], box[1]], + [box[2], box[1]], + [box[2], box[3]], + [box[0], box[3]], + ] + for box in box_targets ] - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets))) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) + ) + for crop, label in zip(crops, list(text_targets)): + # filter labels with unknown characters + if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
@@ -386,8 +453,8 @@

Source code for doctr.datasets.funsd

       
     
   
-
- + + diff --git a/v0.5.1/_modules/doctr/datasets/generator/tensorflow.html b/v0.5.1/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.5.1/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.5.1/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/datasets/ic03.html b/v0.5.1/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.5.1/_modules/doctr/datasets/ic03.html +++ b/v0.5.1/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/ic13.html b/v0.5.1/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.5.1/_modules/doctr/datasets/ic13.html +++ b/v0.5.1/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/iiit5k.html b/v0.5.1/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.5.1/_modules/doctr/datasets/iiit5k.html +++ b/v0.5.1/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/iiithws.html b/v0.5.1/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.5.1/_modules/doctr/datasets/iiithws.html +++ b/v0.5.1/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/imgur5k.html b/v0.5.1/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.5.1/_modules/doctr/datasets/imgur5k.html +++ b/v0.5.1/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/loader.html b/v0.5.1/_modules/doctr/datasets/loader.html index d32e6da298..ed80350ef0 100644 --- a/v0.5.1/_modules/doctr/datasets/loader.html +++ b/v0.5.1/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.loader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-import tensorflow as tf
-import numpy as np
-from typing import Optional
+from typing import Callable, Optional
 
-from .multithreading import multithread_exec
+import numpy as np
+import tensorflow as tf
 
 __all__ = ["DataLoader"]
 
@@ -293,12 +314,13 @@ 

Source code for doctr.datasets.loader

     """Collate multiple elements into batches
 
     Args:
+    ----
         samples: list of N tuples containing M elements
 
     Returns:
+    -------
         Tuple of M sequences containing N elements each
     """
-
     batch_data = zip(*samples)
 
     tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
@@ -307,23 +329,23 @@ 

Source code for doctr.datasets.loader

 
 
 
-[docs] +[docs] class DataLoader: """Implements a dataset wrapper for fast data loading - Example:: - >>> from doctr.datasets import FUNSD, DataLoader - >>> train_set = CORD(train=True, download=True) - >>> train_loader = DataLoader(train_set, batch_size=32) - >>> train_iter = iter(train_loader) - >>> images, targets = next(train_iter) + >>> from doctr.datasets import CORD, DataLoader + >>> train_set = CORD(train=True, download=True) + >>> train_loader = DataLoader(train_set, batch_size=32) + >>> train_iter = iter(train_loader) + >>> images, targets = next(train_iter) Args: + ---- dataset: the dataset shuffle: whether the samples should be shuffled before passing it to the iterator batch_size: number of elements in each batch drop_last: if `True`, drops the last batch if it isn't full - workers: number of workers to use for data loading + collate_fn: function to merge samples into a batch """ def __init__( @@ -332,17 +354,22 @@

Source code for doctr.datasets.loader

         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        workers: Optional[int] = None,
+        collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
         self.batch_size = batch_size
         nb = len(self.dataset) / batch_size
         self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
-        self.workers = workers
+        if collate_fn is None:
+            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
+        else:
+            self.collate_fn = collate_fn
         self.reset()
 
+    def __len__(self) -> int:
+        return self.num_batches
+
     def reset(self) -> None:
         # Updates indices after each epoch
         self._num_yielded = 0
@@ -358,9 +385,9 @@ 

Source code for doctr.datasets.loader

         if self._num_yielded < self.num_batches:
             # Get next indices
             idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
+            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
 
-            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
+            samples = list(map(self.dataset.__getitem__, indices))
 
             batch_data = self.collate_fn(samples)
 
@@ -401,8 +428,8 @@ 

Source code for doctr.datasets.loader

       
     
   
-
- +
+ diff --git a/v0.5.1/_modules/doctr/datasets/mjsynth.html b/v0.5.1/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.5.1/_modules/doctr/datasets/mjsynth.html +++ b/v0.5.1/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/ocr.html b/v0.5.1/_modules/doctr/datasets/ocr.html index 11297d5952..ce1ed8b0d4 100644 --- a/v0.5.1/_modules/doctr/datasets/ocr.html +++ b/v0.5.1/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.ocr

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple
 
-from .datasets import AbstractDataset
-from doctr.utils.geometry import fit_rbbox
+import numpy as np
 
+from .datasets import AbstractDataset
 
-__all__ = ['OCRDataset']
+__all__ = ["OCRDataset"]
 
 
 
-[docs] +[docs] class OCRDataset(AbstractDataset): """Implements an OCR dataset + >>> from doctr.datasets import OCRDataset + >>> train_set = OCRDataset(img_folder="/path/to/images", + >>> label_file="/path/to/labels.json") + >>> img, target = train_set[0] + Args: + ---- img_folder: local path to image folder (all jpg at the root) label_file: local path to the label file - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) - **kwargs: keyword arguments from `VisionDataset`. + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + **kwargs: keyword arguments from `AbstractDataset`. """ def __init__( self, img_folder: str, label_file: str, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, **kwargs: Any, ) -> None: - - self.sample_transforms = sample_transforms - self.root = img_folder + super().__init__(img_folder, **kwargs) # List images self.data: List[Tuple[str, Dict[str, Any]]] = [] - with open(label_file, 'rb') as f: + np_dtype = np.float32 + with open(label_file, "rb") as f: data = json.load(f) - for file_dic in data: + for img_name, annotations in data.items(): # Get image path - img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg' + img_name = Path(img_name) # File existence check if not os.path.exists(os.path.join(self.root, img_name)): raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") # handle empty images - if (len(file_dic["coordinates"]) == 0 or - (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")): - self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[]))) + if len(annotations["typed_words"]) == 0: + self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[]))) continue - is_valid: List[bool] = [] - box_targets: List[List[float]] = [] - for box in file_dic["coordinates"]: - if rotated_bbox: - x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32)) - box = [x, y, w, h, alpha] - is_valid.append(w > 0 and h > 0) - else: - xs, ys = zip(*box) - box = [min(xs), min(ys), max(xs), max(ys)] - is_valid.append(box[0] < box[2] and box[1] < box[3]) - if is_valid[-1]: - box_targets.append(box) + # Unpack the straight boxes (xmin, ymin, xmax, ymax) + geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + geoms = [ + [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]] # type: ignore[list-item] + for geom in geoms + ] + + text_targets = [obj["value"] for obj in annotations["typed_words"]] - text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid] - self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
+ self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
@@ -383,8 +402,8 @@

Source code for doctr.datasets.ocr

       
     
   
- - + + diff --git a/v0.5.1/_modules/doctr/datasets/recognition.html b/v0.5.1/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.5.1/_modules/doctr/datasets/recognition.html +++ b/v0.5.1/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/sroie.html b/v0.5.1/_modules/doctr/datasets/sroie.html index 66fd4ca3e0..04cf10bda2 100644 --- a/v0.5.1/_modules/doctr/datasets/sroie.html +++ b/v0.5.1/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.sroie

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import csv
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['SROIE']
+__all__ = ["SROIE"]
 
 
 
-[docs] +[docs] class SROIE(VisionDataset): """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction" <https://arxiv.org/pdf/2103.10213.pdf>`_. - Example:: - >>> from doctr.datasets import SROIE - >>> train_set = SROIE(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0 + :align: center + + >>> from doctr.datasets import SROIE + >>> train_set = SROIE(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip', - 'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip', - '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0", + "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f", + "sroie2019_train_task1.zip", + ) + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0", + "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2", + "sroie2019_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - self.sample_transforms = sample_transforms self.train = train - if rotated_bbox: - raise NotImplementedError + tmp_root = os.path.join(self.root, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + np_dtype = np.float32 - # # List images - self.root = os.path.join(self._root, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - _targets = [] - with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f: - for row in csv.reader(f, delimiter=','): - # Safeguard for blank lines - if len(row) > 0: - # Label may contain commas - label = ",".join(row[8:]) - # Reduce 8 coords to 4 - p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8]) - left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x) - top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y) - if len(label) > 0: - _targets.append((label, [left, top, right, bot])) - - text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets))) + with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f: + _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0] + + labels = [",".join(row[8:]) for row in _rows] + # reorder coordinates (8 -> (4,2) -> + # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines + coords: np.ndarray = np.stack( + [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0 + ) + + if not use_polygons: + # xmin, ymin, xmax, ymax + coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1) + + if recognition_task: + crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords) + for crop, label in zip(crops, labels): + if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0: + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, coords)) + else: + self.data.append((img_path, dict(boxes=coords, labels=labels))) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
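For readers scanning the updated SROIE loader above, a minimal usage sketch; it mirrors the docstring example, the recognition_task flag comes from the new signature, and everything else is illustrative rather than canonical:

from doctr.datasets import SROIE

# Default mode: each sample is a page image with a dict of boxes (relative coords) and labels
train_set = SROIE(train=True, download=True)
img, target = train_set[0]          # target: {"boxes": np.ndarray, "labels": [str, ...]}

# Recognition mode: each sample becomes a cropped word image with its transcription
reco_set = SROIE(train=True, download=True, recognition_task=True)
crop, label = reco_set[0]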
@@ -390,8 +444,8 @@

Source code for doctr.datasets.sroie

       
     
   
-
- + + diff --git a/v0.5.1/_modules/doctr/datasets/svhn.html b/v0.5.1/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.5.1/_modules/doctr/datasets/svhn.html +++ b/v0.5.1/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/svt.html b/v0.5.1/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.5.1/_modules/doctr/datasets/svt.html +++ b/v0.5.1/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/synthtext.html b/v0.5.1/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.5.1/_modules/doctr/datasets/synthtext.html +++ b/v0.5.1/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.5.1/_modules/doctr/datasets/utils.html b/v0.5.1/_modules/doctr/datasets/utils.html index 2259698c0f..bde9304597 100644 --- a/v0.5.1/_modules/doctr/datasets/utils.html +++ b/v0.5.1/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.utils

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import string
 import unicodedata
+from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Sequence as SequenceType
+
 import numpy as np
-from typing import List, Optional, Any
+from PIL import Image
+
+from doctr.io.image import get_img_shape
+from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
 
 from .vocabs import VOCABS
 
-__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
+__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
+
+ImageTensor = TypeVar("ImageTensor")
 
 
 def translate(
     input_string: str,
     vocab_name: str,
-    unknown_char: str = '■',
+    unknown_char: str = "■",
 ) -> str:
     """Translate a string input in a given vocabulary
 
     Args:
+    ----
         input_string: input string to translate
         vocab_name: vocabulary to use (french, latin, ...)
         unknown_char: unknown character for non-translatable characters
 
     Returns:
-        A string translated in a given vocab"""
-
+    -------
+        A string translated in a given vocab
+    """
     if VOCABS.get(vocab_name) is None:
         raise KeyError("output vocabulary must be in vocabs dictionnary")
 
-    translated = ''
+    translated = ""
     for char in input_string:
         if char not in VOCABS[vocab_name]:
             # we need to translate char into a vocab char
@@ -315,51 +350,63 @@ 

Source code for doctr.datasets.utils

                 # remove whitespaces
                 continue
             # normalize character if it is not in vocab
-            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
-            if char == '' or char not in VOCABS[vocab_name]:
+            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
+            if char == "" or char not in VOCABS[vocab_name]:
                 # if normalization fails or char still not in vocab, return the unknown character
                 char = unknown_char
         translated += char
     return translated
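A short illustrative call of translate; the "latin" vocab name comes from the docstring above, and the asserted output assumes a latin vocab made of digits, ASCII letters and punctuation, without accented characters:

from doctr.datasets.utils import translate

# "ü" is not in the latin vocab, so it is NFD-normalized down to "u";
# characters that cannot be normalized would be replaced by unknown_char instead
assert translate("Stück", "latin", unknown_char="■") == "Stuck"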
 
 
-def encode_sequence(
+def encode_string(
     input_string: str,
     vocab: str,
 ) -> List[int]:
     """Given a predefined mapping, encode the string to a sequence of numbers
 
     Args:
+    ----
         input_string: string to encode
         vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A list encoding the input_string"""
-
-    return list(map(vocab.index, input_string))  # type: ignore[arg-type]
+    -------
+        A list encoding the input_string
+    """
+    try:
+        return list(map(vocab.index, input_string))
+    except ValueError:
+        raise ValueError(
+            f"some characters cannot be found in 'vocab'. \
+                         Please check the input string {input_string} and the vocabulary {vocab}"
+        )
 
 
 def decode_sequence(
-    input_array: np.array,
+    input_seq: Union[np.ndarray, SequenceType[int]],
     mapping: str,
 ) -> str:
     """Given a predefined mapping, decode the sequence of numbers to a string
 
     Args:
-        input_array: array to decode
+    ----
+        input_seq: array to decode
         mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A string, decoded from input_array"""
-
-    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
+    -------
+        A string, decoded from input_seq
+    """
+    if not isinstance(input_seq, (Sequence, np.ndarray)):
+        raise TypeError("Invalid sequence type")
+    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
         raise AssertionError("Input must be an array of int, with max less than mapping size")
-    decoded = ''.join(mapping[idx] for idx in input_array)
-    return decoded
+
+    return "".join(map(mapping.__getitem__, input_seq))
 
 
 
-[docs] +[docs] def encode_sequences( sequences: List[str], vocab: str, @@ -367,48 +414,53 @@

Source code for doctr.datasets.utils

     eos: int = -1,
     sos: Optional[int] = None,
     pad: Optional[int] = None,
-    **kwargs: Any,
+    dynamic_seq_length: bool = False,
 ) -> np.ndarray:
     """Encode character sequences using a given vocab as mapping
 
     Args:
+    ----
         sequences: the list of character sequences of size N
         vocab: the ordered vocab to use for encoding
         target_size: maximum length of the encoded data
         eos: encoding of End Of String
         sos: optional encoding of Start Of String
         pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
+        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
 
     Returns:
+    -------
         the padded encoded data as a tensor
     """
-
     if 0 <= eos < len(vocab):
         raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
 
-    if not isinstance(target_size, int):
-        target_size = max(len(w) for w in sequences)
-        if sos:
-            target_size += 1
-        if pad:
-            target_size += 1
+    if not isinstance(target_size, int) or dynamic_seq_length:
+        # Maximum string length + EOS
+        max_length = max(len(w) for w in sequences) + 1
+        if isinstance(sos, int):
+            max_length += 1
+        if isinstance(pad, int):
+            max_length += 1
+        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
 
     # Pad all sequences
-    if pad:  # pad with padding symbol
+    if isinstance(pad, int):  # pad with padding symbol
         if 0 <= pad < len(vocab):
             raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
         # In that case, add EOS at the end of the word before padding
-        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
+        default_symbol = pad
     else:  # pad with eos symbol
-        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
+        default_symbol = eos
+    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
 
-    for idx, seq in enumerate(sequences):
-        encoded_seq = encode_sequence(seq, vocab)
-        if pad:  # add eos at the end of the sequence
-            encoded_seq.append(eos)
-        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
+    # Encode the strings
+    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
+        if isinstance(pad, int):  # add eos at the end of the sequence
+            seq.append(eos)
+        encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)]
 
-    if sos:  # place eos symbol at the beginning of each sequence
+    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
         if 0 <= sos < len(vocab):
             raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
         encoded_data = np.roll(encoded_data, 1)
@@ -416,6 +468,59 @@ 

Source code for doctr.datasets.utils

 
     return encoded_data
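To make the padding rules above concrete, a hedged sketch on a toy vocab; the eos and pad indices are deliberately chosen outside the vocab range, as the checks above require:

from doctr.datasets.utils import encode_sequences

vocab = "abcdefghijklmnopqrstuvwxyz"
encoded = encode_sequences(["cab", "be"], vocab, target_size=6, eos=26, pad=27)
# Each row is the encoded word, followed by one EOS, then PAD up to target_size:
# [[ 2,  0,  1, 26, 27, 27],
#  [ 1,  4, 26, 27, 27, 27]]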
+ + +def convert_target_to_relative( + img: ImageTensor, target: Union[np.ndarray, Dict[str, Any]] +) -> Tuple[ImageTensor, Union[Dict[str, Any], np.ndarray]]: + if isinstance(target, np.ndarray): + target = convert_to_relative_coords(target, get_img_shape(img)) + else: + target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img)) + return img, target + + +def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]: + """Crop a set of bounding boxes from an image + + Args: + ---- + img_path: path to the image + geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4) + + Returns: + ------- + a list of cropped images + """ + with Image.open(img_path) as pil_img: + img: np.ndarray = np.array(pil_img.convert("RGB")) + # Polygon + if geoms.ndim == 3 and geoms.shape[1:] == (4, 2): + return extract_rcrops(img, geoms.astype(dtype=int)) + if geoms.ndim == 2 and geoms.shape[1] == 4: + return extract_crops(img, geoms.astype(dtype=int)) + raise ValueError("Invalid geometry format") + + +def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]: + """Converts multiclass target to relative coordinates. + + Args: + ---- + img: Image + target: tuple of target polygons and their classes names + + Returns: + ------- + Image and dictionary of boxes, with class names as keys + """ + boxes = convert_to_relative_coords(target[0], get_img_shape(img)) + boxes_classes = target[1] + boxes_dict: Dict = {k: [] for k in sorted(set(boxes_classes))} + for k, poly in zip(boxes_classes, boxes): + boxes_dict[k].append(poly) + boxes_dict = {k: np.stack(v, axis=0) for k, v in boxes_dict.items()} + return img, boxes_dict
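crop_bboxes_from_image, added above, can also be used on its own; a minimal sketch where the image path is a placeholder:

import numpy as np
from doctr.datasets.utils import crop_bboxes_from_image

# Two straight boxes in absolute pixel coordinates (xmin, ymin, xmax, ymax)
boxes = np.array([[10, 10, 120, 40], [30, 60, 200, 95]])
crops = crop_bboxes_from_image("path/to/receipt.jpg", geoms=boxes)   # list of HxWx3 uint8 arrays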
@@ -448,8 +553,8 @@

Source code for doctr.datasets.utils

       
     
   
- - + + diff --git a/v0.5.1/_modules/doctr/datasets/wildreceipt.html b/v0.5.1/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.5.1/_modules/doctr/datasets/wildreceipt.html +++ b/v0.5.1/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.5.1/_modules/doctr/documents/elements.html b/v0.5.1/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.5.1/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
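Although this page is removed in favour of doctr.io.elements, the element hierarchy it documents is easiest to grasp from a small construction example; coordinates are illustrative and relative to the page size, as the docstrings above state:

from doctr.documents.elements import Block, Document, Line, Page, Word

w1 = Word("Hello", 0.99, ((0.10, 0.10), (0.25, 0.14)))
w2 = Word("world", 0.98, ((0.27, 0.10), (0.40, 0.14)))
line = Line([w1, w2])                    # geometry resolved from the enclosed words
page = Page(blocks=[Block(lines=[line])], page_idx=0, dimensions=(595, 842))
doc = Document(pages=[page])
print(doc.render())                      # "Hello world"
print(doc.export())                      # nested dict of the whole hierarchy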
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/_modules/doctr/documents/reader.html b/v0.5.1/_modules/doctr/documents/reader.html deleted file mode 100644 index cdcd814b6c..0000000000 --- a/v0.5.1/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/_modules/doctr/io/elements.html b/v0.5.1/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.5.1/_modules/doctr/io/elements.html +++ b/v0.5.1/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.5.1/_modules/doctr/io/html.html b/v0.5.1/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.5.1/_modules/doctr/io/html.html +++ b/v0.5.1/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.5.1/_modules/doctr/io/image/base.html b/v0.5.1/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.5.1/_modules/doctr/io/image/base.html +++ b/v0.5.1/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.5.1/_modules/doctr/io/image/tensorflow.html b/v0.5.1/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.5.1/_modules/doctr/io/image/tensorflow.html +++ b/v0.5.1/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.5.1/_modules/doctr/io/pdf.html b/v0.5.1/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.5.1/_modules/doctr/io/pdf.html +++ b/v0.5.1/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.5.1/_modules/doctr/io/reader.html b/v0.5.1/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.5.1/_modules/doctr/io/reader.html +++ b/v0.5.1/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.5.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.5.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.5.1/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.5.1/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.5.1/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.5.1/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.5.1/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/models/classification/zoo.html b/v0.5.1/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.5.1/_modules/doctr/models/classification/zoo.html +++ b/v0.5.1/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

<
- + diff --git a/v0.5.1/_modules/doctr/models/detection/differentiable_binarization.html b/v0.5.1/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index 38e9b36ec2..0000000000 --- a/v0.5.1/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.differentiable_binarization - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to unshrink polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize p_map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: the polygon vertices, as an array of (x, y) absolute coordinates
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly cast to an ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
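The unclip step above is easier to follow on a toy box; a standalone sketch of the same Shapely + pyclipper recipe, with illustrative numbers:

import cv2
import numpy as np
import pyclipper
from shapely.geometry import Polygon

points = np.array([[10, 10], [110, 10], [110, 40], [10, 40]])   # a 100 x 30 px box
unclip_ratio = 1.5

poly = Polygon(points)
distance = poly.area * unclip_ratio / poly.length   # 3000 * 1.5 / 260 ≈ 17.3 px of padding

offset = pyclipper.PyclipperOffset()
offset.AddPath(points.tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.asarray(offset.Execute(distance)[0])            # vertices of the padded polygon
x, y, w, h = cv2.boundingRect(expanded.astype(np.int32))      # 4-point box around it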
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too-small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channel to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 1, -1):
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature maps is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
-
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon treshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coord., to draw the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
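The balanced BCE above is an online hard-negative-mining scheme: every positive pixel contributes, but only the hardest negatives, capped at three times the positive count, are kept before averaging. A small NumPy sketch of the same bookkeeping, with made-up per-pixel losses:

import numpy as np

target = np.array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.float32)  # 2 positives, 8 negatives
bce = np.random.rand(10).astype(np.float32)                          # toy per-pixel BCE losses

pos_count = int(target.sum())                                        # 2
neg_count = int(min((1 - target).sum(), 3 * pos_count))              # keep at most 3x the positives -> 6

pos_loss = (bce * target).sum()                                      # every positive contributes
hard_negatives = np.sort(bce * (1 - target))[::-1][:neg_count]       # hardest negatives only
balanced_bce = (pos_loss + hard_negatives.sum()) / (pos_count + neg_count + 1e-6)
print(balanced_bce)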
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
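For orientation, the branching in call() above means the returned dict changes with the inputs; a hypothetical usage sketch, assuming model, images and targets have already been built and preprocessed:

# Inference: no target, so post-processed boxes are computed
out = model(images, return_model_output=True)
prob_map = out["out_map"]   # sigmoid probability map
boxes = out["boxes"]        # post-processed detections

# Training: providing a target adds the loss entry instead
out = model(images, target=targets)
loss = out["loss"]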
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
\ No newline at end of file diff --git a/v0.5.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.5.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 9145c7c3fd..66cef8663d 100644 --- a/v0.5.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import List, Tuple, Optional, Any, Dict
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
+
+from ...classification import mobilenet_v3_large
 from .base import DBPostProcessor, _DBNet
 
-__all__ = ['DBNet', 'db_resnet50']
+__all__ = ["DBNet", "db_resnet50", "db_mobilenet_v3_large"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
+    "db_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_resnet50-649fa22b.weights.h5&src=0",
+    },
+    "db_mobilenet_v3_large": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_mobilenet_v3_large-ee2e1dbe.weights.h5&src=0",
     },
 }
 
@@ -313,6 +348,7 @@ 

Source code for doctr.models.detection.differentiable_binarization.tensorflo <https://arxiv.org/pdf/1612.03144.pdf>`_. Args: + ---- channels: number of channel to output """ @@ -322,9 +358,9 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo ) -> None: super().__init__() self.channels = channels - self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest') - self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)] - self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)] + self.upsample = layers.UpSampling2D(size=(2, 2), interpolation="nearest") + self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer="he_normal") for _ in range(4)] + self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2**idx) for idx in range(4)] @staticmethod def build_upsampling( @@ -334,20 +370,21 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo """Module which performs a 3x3 convolution followed by up-sampling Args: + ---- channels: number of output channels dilation_factor (int): dilation factor to scale the convolution output before concatenation Returns: + ------- a keras.layers.Layer object, wrapping these operations in a sequential module """ - - _layers = conv_sequence(channels, 'relu', True, kernel_size=3) + _layers = conv_sequence(channels, "relu", True, kernel_size=3) if dilation_factor > 1: - _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest')) + _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation="nearest")) - module = keras.Sequential(_layers) + module = Sequential(_layers) return module @@ -359,7 +396,6 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo x: List[tf.Tensor], **kwargs: Any, ) -> tf.Tensor: - # Channel mapping results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)] # Upsample & sum @@ -371,200 +407,324 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo return layers.concatenate(results) -class DBNet(_DBNet, keras.Model, NestedObject): +class DBNet(_DBNet, Model, NestedObject): """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_. Args: + ---- feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to + bin_thresh: threshold for binarization + box_thresh: minimal objectness score to consider a box + assume_straight_pages: if True, fit straight bounding boxes only + exportable: onnx exportable returns only logits + cfg: the configuration dict of the model + class_names: list of class names """ - _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "fpn", "probability_head", "threshold_head", "postprocessor"] def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, - rotated_bbox: bool = False, + fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea + bin_thresh: float = 0.3, + box_thresh: float = 0.1, + assume_straight_pages: bool = True, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, + class_names: List[str] = [CLASS_NAME], ) -> None: - super().__init__() + self.class_names = class_names + num_classes: int = len(self.class_names) self.cfg = cfg self.feat_extractor = feature_extractor - self.rotated_bbox = rotated_bbox + self.exportable = exportable + self.assume_straight_pages = assume_straight_pages self.fpn = FeaturePyramidNetwork(channels=fpn_channels) # Initialize kernels _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape] output_shape = tuple(self.fpn(_inputs).shape) - self.probability_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] + self.probability_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + self.threshold_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + + self.postprocessor = DBPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh ) - self.threshold_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] - ) - - self.postprocessor = 
DBPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, thresh_map: tf.Tensor, - target: List[Dict[str, Any]] + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output Args: + ---- out_map: output feature map of the model of shape (N, H, W, C) thresh_map: threshold map of shape (N, H, W, C) target: list of dictionary where each dict has a `boxes` and a `flags` entry + gamma: modulating factor in the focal loss formula + alpha: balancing factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") - prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1])) - thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) + prob_map = tf.math.sigmoid(out_map) + thresh_map = tf.math.sigmoid(thresh_map) - seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + seg_target, seg_mask, thresh_target, thresh_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) + seg_mask = tf.cast(seg_mask, tf.float32) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) - # Compute balanced BCE loss for proba_map - bce_scale = 5. - bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask] - - neg_target = 1 - seg_target[seg_mask] - positive_count = tf.math.reduce_sum(seg_target[seg_mask]) - negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count]) - negative_loss = bce_loss * neg_target - negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32)) - sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss) - balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6) - - # Compute dice loss for approxbin_map - bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask]))) - - bce_min = tf.math.reduce_min(bce_loss) - weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1. 
- inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights) - union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8 - dice_loss = 1 - 2.0 * inter / union + # Focal loss + focal_scale = 10.0 + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + + # Convert logits to prob, compute gamma factor + p_t = (seg_target * prob_map) + ((1 - seg_target) * (1 - prob_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class or for approx binary_map + if len(self.class_names) > 1: + dice_map = tf.nn.softmax(out_map, axis=-1) + else: + # compute binary map instead + dice_map = 1.0 / (1.0 + tf.exp(-50 * (prob_map - thresh_map))) + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) # Compute l1 loss for thresh_map - l1_scale = 10. if tf.reduce_any(thresh_mask): - l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask])) + thresh_mask = tf.cast(thresh_mask, tf.float32) + l1_loss = tf.reduce_sum(tf.abs(thresh_map - thresh_target) * thresh_mask) / ( + tf.reduce_sum(thresh_mask) + eps + ) else: - l1_loss = tf.constant(0.) + l1_loss = tf.constant(0.0) - return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss + return l1_loss + focal_scale * focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) feat_concat = self.fpn(feat_maps, **kwargs) logits = self.probability_head(feat_concat, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: - # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + if target is None or return_preds: + # Post-process boxes (keep only text predictions) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: thresh_map = self.threshold_head(feat_concat, **kwargs) loss = self.compute_loss(logits, thresh_map, target) - out['loss'] = loss + out["loss"] = loss return out -def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: +def _db_resnet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['fpn_channels'] = 
kwargs.get('fpn_channels', _cfg['fpn_channels']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) # Feature extractor - resnet = tf.keras.applications.__dict__[_cfg['backbone']]( - include_top=False, - weights=None, - input_shape=_cfg['input_shape'], - pooling=None, + feat_extractor = IntermediateLayerGetter( + backbone_fn( + weights="imagenet" if pretrained_backbone else None, + include_top=False, + pooling=None, + input_shape=_cfg["input_shape"], + ), + fpn_layers, ) + # Build the model + model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + + # Load pretrained parameters + if pretrained: + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) + + return model + + +def _db_mobilenet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained + + # Patch the config + _cfg = deepcopy(default_cfgs[arch]) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = default_cfgs[arch].get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor feat_extractor = IntermediateLayerGetter( - resnet, - _cfg['fpn_layers'], + backbone_fn( + input_shape=_cfg["input_shape"], + include_top=False, + pretrained=pretrained_backbone, + ), + fpn_layers, ) - kwargs['fpn_channels'] = _cfg['fpn_channels'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] - # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model
-[docs] +[docs] def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture Returns: + ------- text detection architecture """ + return _db_resnet( + "db_resnet50", + pretrained, + ResNet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
+ + + +
+[docs] +def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: + """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" + <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. + + >>> import tensorflow as tf + >>> from doctr.models import db_mobilenet_v3_large + >>> model = db_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) - return _db_resnet('db_resnet50', pretrained, **kwargs)
+ Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture + + Returns: + ------- + text detection architecture + """ + return _db_mobilenet( + "db_mobilenet_v3_large", + pretrained, + mobilenet_v3_large, + ["inverted_2", "inverted_5", "inverted_11", "final_block"], + **kwargs, + )

@@ -598,8 +758,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo - - + + diff --git a/v0.5.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.5.1/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.5.1/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/models/detection/linknet.html b/v0.5.1/_modules/doctr/models/detection/linknet.html deleted file mode 100644 index 129cfdce8b..0000000000 --- a/v0.5.1/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@ - doctr.models.detection.linknet - docTR documentation

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from differentiable linknet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing x, y, w, h, score for the box
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
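The post-processing above reduces to: binarize, find connected components, score and size-filter them, then express the kept rectangles in relative coordinates. A self-contained sketch on a synthetic bitmap (OpenCV and NumPy assumed available; the blob and thresholds are made up):

import cv2
import numpy as np

bitmap = np.zeros((64, 64), dtype=np.uint8)
bitmap[10:20, 5:40] = 1                                            # one synthetic text blob

n_labels, label_img = cv2.connectedComponents(bitmap, connectivity=4)
h, w = bitmap.shape
for label in range(1, n_labels):                                   # label 0 is the background
    points = np.array(np.where(label_img == label)[::-1]).T.astype(np.int32)  # (x, y) pixel coords
    x, y, bw, bh = cv2.boundingRect(points)
    # relative coordinates, as in bitmap_to_boxes above
    print(x / w, y / h, (x + bw) / w, (y + bh) / h)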
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
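Each decoder block above squeezes the channels to in_chan // 4 with a 1x1 convolution, doubles the spatial resolution with a strided transposed convolution, then expands to out_chan with another 1x1 convolution. A rough Keras re-sketch of that flow without doctr's conv_sequence helper (layer hyper-parameters are approximated):

import tensorflow as tf
from tensorflow.keras import Sequential, layers

def toy_decoder_block(in_chan: int, out_chan: int) -> Sequential:
    # Rough re-sketch of decoder_block above, for shape inspection only
    return Sequential([
        layers.Conv2D(in_chan // 4, 1, use_bias=False),                                      # squeeze channels
        layers.BatchNormalization(),
        layers.Activation("relu"),
        layers.Conv2DTranspose(in_chan // 4, 3, strides=2, padding="same", use_bias=False),  # 2x upsample
        layers.BatchNormalization(),
        layers.Activation("relu"),
        layers.Conv2D(out_chan, 1, use_bias=False),                                          # expand channels
        layers.BatchNormalization(),
        layers.Activation("relu"),
    ])

x = tf.zeros((1, 32, 32, 128))
print(toy_decoder_block(128, 64)(x).shape)   # (1, 64, 64, 64): resolution doubled, channels 128 -> 64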
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=bool)
-        seg_mask = np.ones(output_shape, dtype=bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
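The loss above only supervises the pixels that survive the mask: targets and logits are gathered with boolean indexing before the BCE is averaged. A tiny TensorFlow sketch of that gathering, on random toy tensors:

import tensorflow as tf

logits = tf.random.normal((2, 4, 4, 1))                            # toy model output
seg_target = tf.cast(tf.random.uniform((2, 4, 4)) > 0.5, tf.float32)
seg_mask = tf.random.uniform((2, 4, 4)) > 0.1                      # False where pixels are ambiguous or too small

# Boolean indexing keeps only the supervised pixels, like seg_target[seg_mask] above
loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(
    tf.boolean_mask(seg_target, seg_mask),
    tf.boolean_mask(tf.squeeze(logits, axis=-1), seg_mask),
    from_logits=True,
))
print(loss)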
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
\ No newline at end of file diff --git a/v0.5.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.5.1/_modules/doctr/models/detection/linknet/tensorflow.html index cd4f446673..ce995f99d4 100644 --- a/v0.5.1/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.linknet.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.classification import resnet18, resnet34, resnet50
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.backbones import ResnetStage
-from doctr.models.utils import conv_sequence, load_pretrained_params
-from .base import LinkNetPostProcessor, _LinkNet
 
-__all__ = ['LinkNet', 'linknet16']
+from .base import LinkNetPostProcessor, _LinkNet
 
+__all__ = ["LinkNet", "linknet_resnet18", "linknet_resnet34", "linknet_resnet50"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet16': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'num_classes': 1,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': None,
+    "linknet_resnet18": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet18-615a82c5.weights.h5&src=0",
+    },
+    "linknet_resnet34": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet34-9d772be5.weights.h5&src=0",
+    },
+    "linknet_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet50-6bf6c8b5.weights.h5&src=0",
     },
 }
 
 
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
+def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential:
     """Creates a LinkNet decoder block"""
-
     return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
+        *conv_sequence(in_chan // 4, "relu", True, kernel_size=1, **kwargs),
         layers.Conv2DTranspose(
             filters=in_chan // 4,
             kernel_size=3,
-            strides=2,
+            strides=stride,
             padding="same",
             use_bias=False,
-            kernel_initializer='he_normal'
+            kernel_initializer="he_normal",
         ),
         layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
+        layers.Activation("relu"),
+        *conv_sequence(out_chan, "relu", True, kernel_size=1),
     ])
 
 
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module"""
+class LinkNetFPN(Model, NestedObject):
+    """LinkNet Decoder module"""
 
     def __init__(
         self,
+        out_chans: int,
+        in_shapes: List[Tuple[int, ...]],
     ) -> None:
-
         super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
+        self.out_chans = out_chans
+        strides = [2] * (len(in_shapes) - 1) + [1]
+        i_chans = [s[-1] for s in in_shapes[::-1]]
+        o_chans = i_chans[1:] + [out_chans]
+        self.decoders = [
+            decoder_block(in_chan, out_chan, s, input_shape=in_shape)
+            for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1])
+        ]
+
+    def call(self, x: List[tf.Tensor], **kwargs: Any) -> tf.Tensor:
+        out = 0
+        for decoder, fmap in zip(self.decoders, x[::-1]):
+            out = decoder(out + fmap, **kwargs)
+        return out
 
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(_LinkNet, keras.Model):
+    def extra_repr(self) -> str:
+        return f"out_chans={self.out_chans}"
+
+
+class LinkNet(_LinkNet, Model):
     """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
     <https://arxiv.org/pdf/1707.03718.pdf>`_.
 
     Args:
-        num_classes: number of channels for the output
+    ----
+        feature extractor: the backbone serving as feature extractor
+        fpn_channels: number of channels each extracted feature maps is mapped to
+        bin_thresh: threshold for binarization of the output feature map
+        box_thresh: minimal objectness score to consider a box
+        assume_straight_pages: if True, fit straight bounding boxes only
+        exportable: onnx exportable returns only logits
+        cfg: the configuration dict of the model
+        class_names: list of class names
     """
 
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
+    _children_names: List[str] = ["feat_extractor", "fpn", "classifier", "postprocessor"]
 
     def __init__(
         self,
-        num_classes: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        rotated_bbox: bool = False,
+        feat_extractor: IntermediateLayerGetter,
+        fpn_channels: int = 64,
+        bin_thresh: float = 0.1,
+        box_thresh: float = 0.1,
+        assume_straight_pages: bool = True,
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
+        class_names: List[str] = [CLASS_NAME],
     ) -> None:
         super().__init__(cfg=cfg)
 
-        self.rotated_bbox = rotated_bbox
+        self.class_names = class_names
+        num_classes: int = len(self.class_names)
 
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
+        self.exportable = exportable
+        self.assume_straight_pages = assume_straight_pages
+
+        self.feat_extractor = feat_extractor
 
-        self.fpn = LinkNetFPN()
+        self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape])
+        self.fpn.build(self.feat_extractor.output_shape)
 
         self.classifier = Sequential([
             layers.Conv2DTranspose(
@@ -393,154 +442,246 @@ 

Source code for doctr.models.detection.linknet.tensorflow

strides=2, padding="same", use_bias=False, - kernel_initializer='he_normal' + kernel_initializer="he_normal", + input_shape=self.fpn.decoders[-1].output_shape[1:], ), layers.BatchNormalization(), - layers.Activation('relu'), - *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), + layers.Activation("relu"), + *conv_sequence(32, "relu", True, kernel_size=3, strides=1), layers.Conv2DTranspose( filters=num_classes, kernel_size=2, strides=2, padding="same", - use_bias=False, - kernel_initializer='he_normal' + use_bias=True, + kernel_initializer="he_normal", ), ]) - self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) + self.postprocessor = LinkNetPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh + ) def compute_loss( self, out_map: tf.Tensor, - target: List[Dict[str, Any]], - focal_loss: bool = False, - alpha: float = .5, - gamma: float = 2., - edge_factor: float = 2., + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on <https://github.com/tensorflow/addons/>`_. Args: + ---- out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry - focal_loss: if True, use focal loss instead of BCE - edge_factor: boost factor for box edges (in case of BCE) + gamma: modulating factor in the focal loss formula alpha: balancing factor in the focal loss formula - gammma: modulating factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ - seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) - edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) + seg_target, seg_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - - # Get the cross_entropy for each entry - bce = tf.keras.losses.binary_crossentropy( - seg_target[seg_mask], - tf.squeeze(out_map, axis=[-1])[seg_mask], - from_logits=True) - - if focal_loss: - if gamma and gamma < 0: - raise ValueError("Value of gamma should be greater than or equal to zero.") - - # Convert logits to prob, compute gamma factor - pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) - p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) - modulating_factor = tf.pow((1.0 - p_t), gamma) - - # Compute alpha factor - alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) - - # compute the final loss - loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) - - else: - # Compute BCE loss with highlighted edges - loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), - bce - ) - loss = tf.reduce_mean(loss) - - return loss + seg_mask = tf.cast(seg_mask, tf.float32) + + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + proba_map = tf.sigmoid(out_map) + + # Focal loss + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + # Convert logits to prob, compute gamma factor + p_t = (seg_target * proba_map) + ((1 - seg_target) * (1 - proba_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * 
(1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class + dice_map = tf.nn.softmax(out_map, axis=-1) if len(self.class_names) > 1 else proba_map + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) + + return focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, - focal_loss: bool = True, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - - logits = self.stem(x) - logits = self.fpn(logits) - logits = self.classifier(logits) + feat_maps = self.feat_extractor(x, **kwargs) + logits = self.fpn(feat_maps, **kwargs) + logits = self.classifier(logits, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) + if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: + if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: - loss = self.compute_loss(logits, target, focal_loss) - out['loss'] = loss + loss = self.compute_loss(logits, target) + out["loss"] = loss return out -def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: +def _linknet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> LinkNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor + feat_extractor = IntermediateLayerGetter( + backbone_fn( + pretrained=pretrained_backbone, + include_top=False, + input_shape=_cfg["input_shape"], + ), + fpn_layers, + ) - kwargs['num_classes'] = _cfg['num_classes'] - kwargs['input_shape'] = _cfg['input_shape'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(cfg=_cfg, **kwargs) + model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + 
skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model -
-[docs] -def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
+[docs] +def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet18 + >>> model = linknet_resnet18(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture + + Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet18", + pretrained, + resnet18, + ["resnet_block_1", "resnet_block_3", "resnet_block_5", "resnet_block_7"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet34(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet16 - >>> model = linknet16(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet34 + >>> model = linknet_resnet34(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture Returns: + ------- text detection architecture """ + return _linknet( + "linknet_resnet34", + pretrained, + resnet34, + ["resnet_block_2", "resnet_block_6", "resnet_block_12", "resnet_block_15"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet50(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet50 + >>> model = linknet_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture - return _linknet('linknet16', pretrained, **kwargs)
+ Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet50", + pretrained, + resnet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
@@ -574,8 +715,8 @@

Source code for doctr.models.detection.linknet.tensorflow

- +
+ diff --git a/v0.5.1/_modules/doctr/models/detection/zoo.html b/v0.5.1/_modules/doctr/models/detection/zoo.html index d3128b8d14..3651c4e2d3 100644 --- a/v0.5.1/_modules/doctr/models/detection/zoo.html +++ b/v0.5.1/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
 from doctr.file_utils import is_tf_available, is_torch_available
-from .core import DetectionPredictor
-from ..preprocessor import PreProcessor
-from .. import detection
 
+from .. import detection
+from ..detection.fast import reparameterize
+from ..preprocessor import PreProcessor
+from .predictor import DetectionPredictor
 
 __all__ = ["detection_predictor"]
 
+ARCHS: List[str]
+
 
 if is_tf_available():
-    ARCHS = ['db_resnet50', 'linknet16']
+    ARCHS = [
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
 elif is_torch_available():
-    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
+    ARCHS = [
+        "db_resnet34",
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
+
 
+def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
+        _model = detection.__dict__[arch](
+            pretrained=pretrained,
+            pretrained_backbone=kwargs.get("pretrained_backbone", True),
+            assume_straight_pages=assume_straight_pages,
+        )
+        # Reparameterize FAST models by default to lower inference latency and memory usage
+        if isinstance(_model, detection.FAST):
+            _model = reparameterize(_model)
+    else:
+        if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
+            raise ValueError(f"unknown architecture: {type(arch)}")
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+        _model = arch
+        _model.assume_straight_pages = assume_straight_pages
+        _model.postprocessor.assume_straight_pages = assume_straight_pages
 
-    # Detection
-    _model = detection.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 1)
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 2)
     predictor = DetectionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
-        _model
+        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
+        _model,
     )
     return predictor
 
 
 
-[docs] -def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: +[docs] +def detection_predictor( + arch: Any = "fast_base", + pretrained: bool = False, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + batch_size: int = 2, + **kwargs: Any, +) -> DetectionPredictor: """Text detection architecture. - Example:: - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_resnet50') + ---- + arch: name of the architecture or model itself to use (e.g. 'db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset + assume_straight_pages: If True, fit straight boxes to the page + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right + batch_size: number of samples the model processes in parallel + **kwargs: optional keyword arguments passed to the architecture Returns: + ------- Detection predictor """ - - return _predictor(arch, pretrained, **kwargs)
+ return _predictor( + arch=arch, + pretrained=pretrained, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + batch_size=batch_size, + **kwargs, + )
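As the updated signature shows, arch may be either a registered name (FAST instances built from a string are reparameterized automatically) or an already instantiated DBNet, LinkNet or FAST model. A minimal usage sketch, assuming pretrained weights can be downloaded:

import numpy as np
from doctr.models import detection_predictor, linknet_resnet18

# string architecture, resolved through ARCHS and detection.__dict__
predictor = detection_predictor(arch="db_resnet50", pretrained=True, batch_size=2)

# or pass a model instance directly
model = linknet_resnet18(pretrained=True)
predictor = detection_predictor(arch=model, assume_straight_pages=True)

page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
out = predictor([page])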
@@ -367,8 +449,8 @@

Source code for doctr.models.detection.zoo

       
     
   

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- -
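The helpers removed above were thin wrappers around tf.lite.TFLiteConverter; an equivalent FP16 export can still be sketched directly against the TensorFlow Lite API (to_tflite_fp16 is a hypothetical helper name, not part of docTR):

import tensorflow as tf

def to_tflite_fp16(tf_model: tf.keras.Model) -> bytes:
    # same recipe as the removed convert_to_fp16: default optimizations + float16 target type
    converter = tf.lite.TFLiteConverter.from_keras_model(tf_model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.target_spec.supported_types = [tf.float16]
    return converter.convert()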
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/_modules/doctr/models/factory/hub.html b/v0.5.1/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.5.1/_modules/doctr/models/factory/hub.html +++ b/v0.5.1/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.5.1/_modules/doctr/models/recognition/crnn.html b/v0.5.1/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.5.1/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.crnn - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs decoding of raw output with CTC and decoding of CTC predictions
-        with label_to_idx mapping dictionnary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/crnn/tensorflow.html index 41cc93dd23..bc64da9a1b 100644 --- a/v0.5.1/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.crnn.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import tensorflow as tf
 from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential, Model
-from typing import Tuple, Dict, Any, Optional, List
+from tensorflow.keras.models import Model, Sequential
+
+from doctr.datasets import VOCABS
 
-from ... import backbones
-from ...utils import load_pretrained_params
+from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
+__all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
+    "crnn_vgg16_bn": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["legacy_french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_vgg16_bn-9c188f45.weights.h5&src=0",
     },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
+    "crnn_mobilenet_v3_small": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_small-54850265.weights.h5&src=0",
+    },
+    "crnn_mobilenet_v3_large": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_large-c64045e5.weights.h5&src=0",
     },
 }
 
 
 class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
+    """Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
         ignore_case: if True, ignore case of letters
         ignore_accents: if True, ignore accents of letters
@@ -325,37 +353,57 @@ 

Source code for doctr.models.recognition.crnn.tensorflow

def __call__( self, - logits: tf.Tensor - ) -> List[Tuple[str, float]]: - """ - Performs decoding of raw output with CTC and decoding of CTC predictions + logits: tf.Tensor, + beam_width: int = 1, + top_paths: int = 1, + ) -> Union[List[Tuple[str, float]], List[Tuple[List[str], List[float]]]]: + """Performs decoding of raw output with CTC and decoding of CTC predictions with label_to_idx mapping dictionnary Args: + ---- logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1 + beam_width: An int scalar >= 0 (beam search beam width). + top_paths: An int scalar >= 0, <= beam_width (controls output size). Returns: + ------- A list of decoded words of length BATCH_SIZE + """ # Decode CTC _decoded, _log_prob = tf.nn.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), - tf.fill(logits.shape[0], logits.shape[1]), - beam_width=1, top_paths=1, + tf.fill(tf.shape(logits)[:1], tf.shape(logits)[1]), + beam_width=beam_width, + top_paths=top_paths, ) - out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab)) - probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) + + _decoded = tf.sparse.concat( + 1, + [tf.sparse.expand_dims(dec, axis=1) for dec in _decoded], + expand_nonconcat_dims=True, + ) # dim : batchsize x beamwidth x actual_max_len_predictions + out_idxs = tf.sparse.to_dense(_decoded, default_value=len(self.vocab)) # Map it to characters _decoded_strings_pred = tf.strings.reduce_join( inputs=tf.nn.embedding_lookup(tf.constant(self._embedding, dtype=tf.string), out_idxs), - axis=-1 + axis=-1, ) _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] - word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - + decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value="not valid")[ + :, :, 0 + ] # dim : batch_size x beam_width + + if top_paths == 1: + probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) # dim : batchsize + decoded_strings_pred = tf.squeeze(decoded_strings_pred, axis=1) + word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] + else: + probs = tf.math.exp(_log_prob) # dim : batchsize x beamwidth + word_values = [[word.decode() for word in words] for words in decoded_strings_pred.numpy().tolist()] return list(zip(word_values, probs.numpy().tolist())) @@ -364,19 +412,26 @@

Source code for doctr.models.recognition.crnn.tensorflow

Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of units in the LSTM layers + exportable: onnx exportable returns only logits + beam_width: beam width for beam search decoding + top_paths: number of top paths for beam search decoding cfg: configuration dictionary """ - _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "decoder", "postprocessor"] def __init__( self, - feature_extractor: tf.keras.Model, + feature_extractor: Model, vocab: str, rnn_units: int = 128, + exportable: bool = False, + beam_width: int = 1, + top_paths: int = 1, cfg: Optional[Dict[str, Any]] = None, ) -> None: # Initialize kernels @@ -386,19 +441,21 @@

Source code for doctr.models.recognition.crnn.tensorflow

self.vocab = vocab self.max_length = w self.cfg = cfg + self.exportable = exportable self.feat_extractor = feature_extractor - self.decoder = Sequential( - [ - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Dense(units=len(vocab) + 1) - ] - ) + self.decoder = Sequential([ + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Dense(units=len(vocab) + 1), + ]) self.decoder.build(input_shape=(None, w, h * c)) self.postprocessor = CTCPostProcessor(vocab=vocab) + self.beam_width = beam_width + self.top_paths = top_paths + def compute_loss( self, model_output: tf.Tensor, @@ -407,16 +464,17 @@

Source code for doctr.models.recognition.crnn.tensorflow

"""Compute CTC loss for the model. Args: - gt: the encoded tensor with gt labels + ---- model_output: predicted logits of the model - seq_len: lengths of each gt word inside the batch + target: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) batch_len = model_output.shape[0] - input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) + input_length = tf.fill((batch_len,), model_output.shape[1]) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -428,8 +486,12 @@

Source code for doctr.models.recognition.crnn.tensorflow

target: Optional[List[str]] = None, return_model_output: bool = False, return_preds: bool = False, + beam_width: int = 1, + top_paths: int = 1, **kwargs: Any, ) -> Dict[str, Any]: + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") features = self.feat_extractor(x, **kwargs) # B x H x W x C --> B x W x H x C @@ -437,91 +499,132 @@

Source code for doctr.models.recognition.crnn.tensorflow

w, h, c = transposed_feat.get_shape().as_list()[1:] # B x W x H x C --> B x W x H * C features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c)) - logits = self.decoder(features_seq, **kwargs) + logits = _bf16_to_float32(self.decoder(features_seq, **kwargs)) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = logits + return out + if return_model_output: out["out_map"] = logits if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(logits) + out["preds"] = self.postprocessor(logits, beam_width=beam_width, top_paths=top_paths) if target is not None: - out['loss'] = self.compute_loss(logits, target) + out["loss"] = self.compute_loss(logits, target) return out -def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: +def _crnn( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> CRNN: + pretrained_backbone = pretrained_backbone and not pretrained + + kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"]) - # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) + _cfg["vocab"] = kwargs["vocab"] + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] - # Feature extractor - feat_extractor = backbones.__dict__[_cfg['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + input_shape=_cfg["input_shape"], include_top=False, + pretrained=pretrained_backbone, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params(model, _cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]) return model
-[docs] +[docs] def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_vgg16_bn + >>> model = crnn_vgg16_bn(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_vgg16_bn", pretrained, vgg16_bn_r, **kwargs)
+ + + +
+[docs] +def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based + Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_small + >>> model = crnn_mobilenet_v3_small(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
+ Returns: + ------- + text recognition architecture + """ + return _crnn("crnn_mobilenet_v3_small", pretrained, mobilenet_v3_small_r, **kwargs)
-def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based +
+[docs] +def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_large + >>> model = crnn_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_mobilenet_v3_large", pretrained, mobilenet_v3_large_r, **kwargs)
- return _crnn('crnn_resnet31', pretrained, **kwargs)
@@ -554,8 +657,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

- +
+ diff --git a/v0.5.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/master/tensorflow.html index 2dc5a27717..aa6aa69325 100644 --- a/v0.5.1/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.master.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import tensorflow as tf
-from tensorflow.keras import layers, Sequential, Model
-from typing import Tuple, List, Dict, Any, Optional
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
 
-from ..core import RecognitionPostProcessor
-from ...backbones.resnet import ResnetStage
-from ...utils import conv_sequence, load_pretrained_params
-from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
-from ....datasets import VOCABS
-from .base import _MASTER, _MASTERPostProcessor
+import tensorflow as tf
+from tensorflow.keras import Model, layers
+
+from doctr.datasets import VOCABS
+from doctr.models.classification import magc_resnet31
+from doctr.models.modules.transformer import Decoder, PositionalEncoding
 
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from .base import _MASTER, _MASTERPostProcessor
 
-__all__ = ['MASTER', 'master', 'MASTERPostProcessor']
+__all__ = ["MASTER", "master"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'master': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'input_shape': (48, 160, 3),
-        'vocab': VOCABS['french'],
-        'url': None,
+    "master": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
     },
 }
 
 
-class MAGC(layers.Layer):
-
-    """Implements the Multi-Aspect Global Context Attention, as described in
-    <https://arxiv.org/pdf/1910.02562.pdf>`_.
-
-    Args:
-        inplanes: input channels
-        headers: number of headers to split channels
-        att_scale: if True, re-scale attention to counteract the variance distibutions
-        **kwargs
-    """
-
-    def __init__(
-        self,
-        inplanes: int,
-        headers: int = 1,
-        att_scale: bool = False,
-        **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-
-        self.headers = headers  # h
-        self.inplanes = inplanes  # C
-        self.att_scale = att_scale
-
-        self.single_header_inplanes = int(inplanes / headers)  # C / h
-
-        self.conv_mask = tf.keras.layers.Conv2D(
-            filters=1,
-            kernel_size=1,
-            kernel_initializer=tf.initializers.he_normal()
-        )
-
-        self.transform = tf.keras.Sequential(
-            [
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-                tf.keras.layers.LayerNormalization([1, 2, 3]),
-                tf.keras.layers.ReLU(),
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-            ],
-            name='transform'
-        )
-
-    @tf.function
-    def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
-        b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
-
-        # B, H, W, C -->> B*h, H, W, C/h
-        x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes))
-        x = tf.transpose(x, perm=(0, 3, 1, 2, 4))
-        x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes))
-
-        # Compute shorcut
-        shortcut = x
-        # B*h, 1, H*W, C/h
-        shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes))
-        # B*h, 1, C/h, H*W
-        shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2])
-
-        # Compute context mask
-        # B*h, H, W, 1,
-        context_mask = self.conv_mask(x)
-        # B*h, 1, H*W, 1
-        context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
-        # scale variance
-        if self.att_scale and self.headers > 1:
-            context_mask = context_mask / tf.sqrt(self.single_header_inplanes)
-        # B*h, 1, H*W, 1
-        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
-
-        # Compute context
-        # B*h, 1, C/h, 1
-        context = tf.matmul(shortcut, context_mask)
-        context = tf.reshape(context, shape=(b, 1, c, 1))
-        # B, 1, 1, C
-        context = tf.transpose(context, perm=(0, 1, 3, 2))
-        # Set shape to resolve shape when calling this module in the Sequential MAGCResnet
-        batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1]
-        context.set_shape([batch, 1, 1, chan])
-        return context
-
-    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
-        # Context modeling: B, H, W, C  ->  B, 1, 1, C
-        context = self.context_modeling(inputs)
-        # Transform: B, 1, 1, C  ->  B, 1, 1, C
-        transformed = self.transform(context)
-        return inputs + transformed
-
-
-class MAGCResnet(Sequential):
-
-    """Implements the modified resnet with MAGC layers, as described in paper.
-
-    Args:
-        headers: number of header to split channels in MAGC layers
-        input_shape: shape of the model input (without batch dim)
-    """
-
-    def __init__(
-        self,
-        headers: int = 1,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
-    ) -> None:
-        _layers = [
-            # conv_1x
-            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
-            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_2x
-            ResnetStage(num_blocks=1, output_channels=256),
-            MAGC(inplanes=256, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_3x
-            ResnetStage(num_blocks=2, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 1), (2, 1)),
-            # conv_4x
-            ResnetStage(num_blocks=5, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            # conv_5x
-            ResnetStage(num_blocks=3, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-        ]
-        super().__init__(_layers)
-
-
 class MASTER(_MASTER, Model):
-
     """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
     Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
 
     Args:
+    ----
+        feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary, (without EOS, SOS, PAD)
         d_model: d parameter for the transformer decoder
-        headers: headers for the MAGC module
         dff: depth of the pointwise feed-forward layer
         num_heads: number of heads for the mutli-head attention module
         num_layers: number of decoder layers to stack
         max_length: maximum length of character sequence handled by the model
-        input_size: size of the image inputs
+        dropout: dropout probability of the decoder
+        input_shape: size of the image inputs
+        exportable: onnx exportable returns only logits
+        cfg: dictionary containing information about the model
     """
 
     def __init__(
         self,
+        feature_extractor: Model,
         vocab: str,
         d_model: int = 512,
-        headers: int = 1,
         dff: int = 2048,
-        num_heads: int = 8,
+        num_heads: int = 8,  # number of heads in the transformer decoder
         num_layers: int = 3,
         max_length: int = 50,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
+        dropout: float = 0.2,
+        input_shape: Tuple[int, int, int] = (32, 128, 3),  # different from the paper
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
 
-        self.vocab = vocab
+        self.exportable = exportable
         self.max_length = max_length
+        self.d_model = d_model
+        self.vocab = vocab
         self.cfg = cfg
         self.vocab_size = len(vocab)
 
-        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
-        self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS
+        self.feat_extractor = feature_extractor
+        self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])
 
         self.decoder = Decoder(
             num_layers=num_layers,
-            d_model=d_model,
+            d_model=self.d_model,
             num_heads=num_heads,
+            vocab_size=self.vocab_size + 3,  # EOS, SOS, PAD
             dff=dff,
-            vocab_size=self.vocab_size,
-            maximum_position_encoding=max_length,
+            dropout=dropout,
+            maximum_position_encoding=self.max_length,
         )
-        self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
-        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
 
+        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
 
     @tf.function
-    def make_mask(self, target: tf.Tensor) -> tf.Tensor:
-        look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
-        target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
-        combined_mask = tf.maximum(target_padding_mask, look_ahead_mask)
-        return combined_mask
+    def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
+        # (N, 1, 1, max_length)
+        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
+        target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
+        target_length = target.shape[1]
+        # sub mask filled diagonal with 1 = see 0 = masked (max_length, max_length)
+        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
+        # source mask filled with ones (max_length, positional_encoded_seq_len)
+        source_mask = tf.ones((target_length, source.shape[1]))
+        # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
+        target_mask = tf.math.logical_and(
+            tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
+        )
+        return source_mask, target_mask
 
+    @staticmethod
     def compute_loss(
-        self,
         model_output: tf.Tensor,
         gt: tf.Tensor,
         seq_len: List[int],
@@ -512,11 +413,13 @@ 

Source code for doctr.models.recognition.master.tensorflow

Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -532,7 +435,7 @@

Source code for doctr.models.recognition.master.tensorflow

mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) @@ -547,94 +450,103 @@

Source code for doctr.models.recognition.master.tensorflow

"""Call function for training Args: + ---- x: images target: list of str labels return_model_output: if True, return logits return_preds: if True, decode logits + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A dictionnary containing eventually loss, logits and predictions. """ - # Encode - feature = self.feature_extractor(x, **kwargs) - b, h, w, c = (tf.shape(feature)[i] for i in range(4)) + feature = self.feat_extractor(x, **kwargs) + b, h, w, c = feature.get_shape() + # (N, H, W, C) --> (N, H * W, C) feature = tf.reshape(feature, shape=(b, h * w, c)) - encoded = feature + self.feature_pe[:, :h * w, :] + # add positional encoding to features + encoded = self.positional_encoding(feature, **kwargs) out: Dict[str, tf.Tensor] = {} + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") + if target is not None: # Compute target: tensor of gts and sequence lengths - gt, seq_len = self.compute_target(target) - - if kwargs.get('training', False): - if target is None: - raise AssertionError("In training mode, you need to pass a value to 'target'") - tgt_mask = self.make_mask(gt) + gt, seq_len = self.build_target(target) + # Compute decoder masks + source_mask, target_mask = self.make_source_and_target_mask(encoded, gt) # Compute logits - output = self.decoder(gt, encoded, tgt_mask, None, **kwargs) + output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) - else: - # When not training, we want to compute logits in with the decoder, although - # we have access to gts (we need gts to compute the loss, but not in the decoder) logits = self.decode(encoded, **kwargs) + logits = _bf16_to_float32(logits) + + if self.exportable: + out["logits"] = logits + return out + if target is not None: - out['loss'] = self.compute_loss(logits, gt, seq_len) + out["loss"] = self.compute_loss(logits, gt, seq_len) if return_model_output: - out['out_map'] = logits + out["out_map"] = logits if return_preds: - predictions = self.postprocessor(logits) - out['preds'] = predictions + out["preds"] = self.postprocessor(logits) return out + @tf.function def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor: """Decode function for prediction Args: + ---- encoded: encoded features + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A Tuple of tf.Tensor: predictions, logits """ - b = tf.shape(encoded)[0] - max_len = tf.constant(self.max_length, dtype=tf.int32) + b = encoded.shape[0] + start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32) # SOS padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32) # PAD - ys = tf.fill(dims=(b, max_len - 1), value=padding_symbol) + ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol) start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols - # max_len = len + 2 (sos + eos) + # Final dimension include EOS/SOS/PAD for i in range(self.max_length - 1): - ys_mask = self.make_mask(ys) - output = self.decoder(ys, encoded, ys_mask, None, **kwargs) + source_mask, target_mask = self.make_source_and_target_mask(encoded, ys) + output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) prob = tf.nn.softmax(logits, axis=-1) - next_word = tf.argmax(prob, axis=-1, output_type=ys.dtype) - # ys.shape = B, T - 
i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(max_len), indexing='ij') + next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype) + # update ys with the next token and ignore the first token (SOS) + i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij") indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1) - ys = tf.tensor_scatter_nd_update(ys, indices, next_word[:, i + 1]) + ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i]) - # final_logits of shape (N, max_length - 1, vocab_size + 1) (whithout sos) + # Shape (N, max_length, vocab_size + 1) return logits class MASTERPostProcessor(_MASTERPostProcessor): """Post processor for MASTER architectures + Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -649,51 +561,66 @@

Source code for doctr.models.recognition.master.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER: +def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"]) + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) - kwargs['vocab'] = _cfg['vocab'] + kwargs["vocab"] = _cfg["vocab"] + kwargs["input_shape"] = _cfg["input_shape"] # Build the model - model = MASTER(cfg=_cfg, **kwargs) + model = MASTER( + backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False), + cfg=_cfg, + **kwargs, + ) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model
-[docs] +[docs] def master(pretrained: bool = False, **kwargs: Any) -> MASTER: """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import master - >>> model = master(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + + >>> import tensorflow as tf + >>> from doctr.models import master + >>> model = master(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keywoard arguments passed to the MASTER architecture + Returns: + ------- text recognition architecture """ - - return _master('master', pretrained, **kwargs)
+ return _master("master", pretrained, magc_resnet31, **kwargs)
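Besides the switch to a MAGC-ResNet31 backbone, the constructor now exposes an exportable flag that makes the forward pass return raw logits only (handy for ONNX export). A minimal sketch of both modes, assuming pretrained weights can be downloaded:

import tensorflow as tf
from doctr.models import master

crops = tf.random.uniform(shape=[2, 32, 128, 3], maxval=1, dtype=tf.float32)

# regular inference: decoded words with confidences from MASTERPostProcessor
model = master(pretrained=True)
out = model(crops, return_preds=True)
words = out["preds"]

# export mode: the call short-circuits and returns only the logits
export_model = master(pretrained=True, exportable=True)
logits = export_model(crops)["logits"]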
@@ -727,8 +654,8 @@

Source code for doctr.models.recognition.master.tensorflow

- +
+ diff --git a/v0.5.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/parseq/tensorflow.html index 0819737dfc..b181acef53 100644 --- a/v0.5.1/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -845,7 +845,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/models/recognition/sar.html b/v0.5.1/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.5.1/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.sar - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H * W) -> (N, 1)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, 1)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + 1) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
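The training branch in the loop above is standard teacher forcing: at train time the ground-truth symbol is fed back into the next step, at inference time the argmax of the previous logits is. A toy, self-contained illustration of that switch (all tensors here are made up):

import tensorflow as tf

logits = tf.random.uniform([2, 10])              # (N, vocab_size + 1) logits at step t
gt = tf.constant([[1, 2, 3, 0], [4, 5, 6, 0]])   # (N, max_length + 1) encoded labels
training, t = True, 1

if training:
    symbol = gt[:, t]                            # ground-truth symbol fed back (teacher forcing)
else:
    symbol = tf.argmax(logits, axis=-1)          # model prediction fed back at inference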
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
\ No newline at end of file diff --git a/v0.5.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/sar/tensorflow.html index e514e4f0c4..4a591e6451 100644 --- a/v0.5.1/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.sar.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
+
 import tensorflow as tf
-from tensorflow.keras import Sequential, layers, Model
-from typing import Tuple, Dict, List, Any, Optional
+from tensorflow.keras import Model, Sequential, layers
 
-from ... import backbones
-from ...utils import load_pretrained_params
-from ..core import RecognitionModel, RecognitionPostProcessor
+from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
 
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
+from ...classification import resnet31
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from ..core import RecognitionModel, RecognitionPostProcessor
+
+__all__ = ["SAR", "sar_resnet31"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
+    "sar_resnet31": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/sar_resnet31-5a58806c.weights.h5&src=0",
     },
 }
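For context, the `mean`/`std`/`input_shape` entries above are the preprocessing statistics applied to word crops before they reach the model. A minimal sketch of that normalization (not doctr's actual preprocessing code; values copied from the config above):

import tensorflow as tf

cfg = {"mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 128, 3)}
img = tf.random.uniform([1, *cfg["input_shape"]])                  # a resized word crop in [0, 1]
img = (img - tf.constant(cfg["mean"])) / tf.constant(cfg["std"])   # channel-wise normalization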
 
 
+class SAREncoder(layers.Layer, NestedObject):
+    """Implements encoder module of the SAR model
+
+    Args:
+    ----
+        rnn_units: number of hidden rnn units
+        dropout_prob: dropout probability
+    """
+
+    def __init__(self, rnn_units: int, dropout_prob: float = 0.0) -> None:
+        super().__init__()
+        self.rnn = Sequential([
+            layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
+            layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
+        ])
+
+    def call(
+        self,
+        x: tf.Tensor,
+        **kwargs: Any,
+    ) -> tf.Tensor:
+        # (N, C)
+        return self.rnn(x, **kwargs)
+
+
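A standalone sketch of what SAREncoder computes, with hypothetical shapes: two stacked LSTMs reduce the vertically max-pooled feature map (N, W, C) to a holistic representation (N, rnn_units):

import tensorflow as tf
from tensorflow.keras import Sequential, layers

encoder = Sequential([
    layers.LSTM(units=512, return_sequences=True),
    layers.LSTM(units=512, return_sequences=False),
])
pooled_features = tf.random.uniform([2, 32, 512])   # (N, W, C) after vertical max pooling
holistic = encoder(pooled_features)                  # (N, 512)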
 class AttentionModule(layers.Layer, NestedObject):
     """Implements attention module of the SAR model
 
     Args:
+    ----
         attention_units: number of hidden attention units
 
     """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
 
+    def __init__(self, attention_units: int) -> None:
         super().__init__()
         self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            3,
+            strides=1,
+            use_bias=True,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
+            1,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.flatten = layers.Flatten()
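The `call` hunk below combines these three projections into a spatial attention map and a glimpse vector; a minimal standalone sketch of that computation (dimensions are arbitrary and the Conv2D layers here are freshly initialized stand-ins):

import tensorflow as tf
from tensorflow.keras import layers

N, H, W, C, A = 2, 4, 16, 512, 512
features = tf.random.uniform([N, H, W, C])
hidden_state = tf.random.uniform([N, 1, 1, C])

score = layers.Conv2D(1, 1, use_bias=False)(tf.math.tanh(
    layers.Conv2D(A, 1, use_bias=False)(hidden_state) + layers.Conv2D(A, 3, padding="same")(features)
))                                                                   # (N, H, W, 1)
attention = tf.nn.softmax(tf.reshape(score, [N, H * W]))             # softmax over all spatial positions
glimpse = tf.reduce_sum(features * tf.reshape(attention, [N, H, W, 1]), axis=[1, 2])   # (N, C)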
 
@@ -343,12 +395,12 @@ 


        hidden_state: tf.Tensor,
        **kwargs: Any,
    ) -> tf.Tensor:
-
         [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
         # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
         features_projection = self.features_projector(features, **kwargs)
+        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
+        hidden_state = tf.expand_dims(tf.expand_dims(hidden_state, axis=1), axis=1)
+        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
         projection = tf.math.tanh(hidden_state_projection + features_projection)
         # shape (N, H, W, attention_units) -> (N, H, W, 1)
         attention = self.attention_projector(projection, **kwargs)
@@ -358,23 +410,25 @@


# shape (N, H * W) -> (N, H, W, 1) attention_map = tf.reshape(attention, [-1, H, W, 1]) glimpse = tf.math.multiply(features, attention_map) - # shape (N, H * W) -> (N, 1) - glimpse = tf.reduce_sum(glimpse, axis=[1, 2]) - return glimpse + # shape (N, H * W) -> (N, C) + return tf.reduce_sum(glimpse, axis=[1, 2]) class SARDecoder(layers.Layer, NestedObject): """Implements decoder module of the SAR model Args: + ---- rnn_units: number of hidden units in recurrent cells max_length: maximum length of a sequence vocab_size: number of classes in the model alphabet embedding_units: number of hidden embedding units attention_units: number of hidden attention units - num_decoder_layers: number of LSTM layers to stack + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability """ + def __init__( self, rnn_units: int, @@ -382,23 +436,22 @@


vocab_size: int, embedding_units: int, attention_units: int, - num_decoder_layers: int = 2, - input_shape: Optional[List[Tuple[Optional[int]]]] = None, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, ) -> None: - super().__init__() self.vocab_size = vocab_size - self.lstm_decoder = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] - ) - self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) - self.attention_module = AttentionModule(attention_units) - self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units)) self.max_length = max_length - # Initialize kernels - if input_shape is not None: - self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units))) + self.embed = layers.Dense(embedding_units, use_bias=False) + self.embed_tgt = layers.Embedding(embedding_units, self.vocab_size + 1) + + self.lstm_cells = layers.StackedRNNCells([ + layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells) + ]) + self.attention_module = AttentionModule(attention_units) + self.output_dense = layers.Dense(self.vocab_size + 1, use_bias=True) + self.dropout = layers.Dropout(dropout_prob) def call( self, @@ -407,40 +460,47 @@


gt: Optional[tf.Tensor] = None, **kwargs: Any, ) -> tf.Tensor: - - # initialize states (each of shape (N, rnn_units)) - states = self.lstm_decoder.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=tf.float32 - ) - # run first step of lstm - # holistic: shape (N, rnn_units) - _, states = self.lstm_decoder(holistic, states, **kwargs) - # Initialize with the index of virtual START symbol (placed after <eos>) - symbol = tf.fill(features.shape[0], self.vocab_size + 1) - logits_list = [] - if kwargs.get('training') and gt is None: - raise ValueError('Need to provide labels during training for teacher forcing') - for t in range(self.max_length + 1): # keep 1 step for <eos> - # one-hot symbol with depth vocab_size + 1 - # embeded_symbol: shape (N, embedding_units) - embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs) - logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs) - glimpse = self.attention_module( - features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs, - ) - # logits: shape (N, rnn_units), glimpse: shape (N, 1) - logits = tf.concat([logits, glimpse], axis=-1) - # shape (N, rnn_units + 1) -> (N, vocab_size + 1) - logits = self.output_dense(logits, **kwargs) - # update symbol with predicted logits for t+1 step - if kwargs.get('training'): - symbol = gt[:, t] # type: ignore[index] + if gt is not None: + gt_embedding = self.embed_tgt(gt, **kwargs) + + logits_list: List[tf.Tensor] = [] + + for t in range(self.max_length + 1): # 32 + if t == 0: + # step to init the first states of the LSTMCell + states = self.lstm_cells.get_initial_state( + inputs=None, batch_size=features.shape[0], dtype=features.dtype + ) + prev_symbol = holistic + elif t == 1: + # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros + # (N, vocab_size + 1) --> (N, embedding_units) + prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1], dtype=features.dtype) + prev_symbol = self.embed(prev_symbol, **kwargs) else: - symbol = tf.argmax(logits, axis=-1) - logits_list.append(logits) - outputs = tf.stack(logits_list, axis=1) # shape (N, max_length + 1, vocab_size + 1) - - return outputs + if gt is not None and kwargs.get("training", False): + # (N, embedding_units) -2 because of <bos> and <eos> (same) + prev_symbol = self.embed(gt_embedding[:, t - 2], **kwargs) + else: + # -1 to start at timestep where prev_symbol was initialized + index = tf.argmax(logits_list[t - 1], axis=-1) + # update prev_symbol with ones at the index of the previous logit vector + prev_symbol = self.embed(self.embed_tgt(index, **kwargs), **kwargs) + + # (N, C), (N, C) take the last hidden state and cell state from current timestep + _, states = self.lstm_cells(prev_symbol, states, **kwargs) + # states = (hidden_state, cell_state) + hidden_state = states[0][0] + # (N, H, W, C), (N, C) --> (N, C) + glimpse = self.attention_module(features, hidden_state, **kwargs) + # (N, C), (N, C) --> (N, 2 * C) + logits = tf.concat([hidden_state, glimpse], axis=1) + logits = self.dropout(logits, **kwargs) + # (N, vocab_size + 1) + logits_list.append(self.output_dense(logits, **kwargs)) + + # (max_length + 1, N, vocab_size + 1) --> (N, max_length + 1, vocab_size + 1) + return tf.transpose(tf.stack(logits_list[1:]), (1, 0, 2)) class SAR(Model, RecognitionModel): @@ -448,17 +508,20 @@


Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of hidden units in both encoder and decoder LSTM embedding_units: number of embedding units attention_units: number of hidden units in attention module max_length: maximum word length handled by the model - num_decoders: number of LSTM to stack in decoder layer - + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability for the encoder and decoder + exportable: onnx exportable returns only logits + cfg: dictionary containing information about the model """ - _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"] def __init__( self, @@ -468,36 +531,34 @@


embedding_units: int = 512, attention_units: int = 512, max_length: int = 30, - num_decoders: int = 2, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: - super().__init__() self.vocab = vocab + self.exportable = exportable self.cfg = cfg - self.max_length = max_length + 1 # Add 1 timestep for EOS after the longest word self.feat_extractor = feature_extractor - self.encoder = Sequential( - [ - layers.LSTM(units=rnn_units, return_sequences=True), - layers.LSTM(units=rnn_units, return_sequences=False) - ] - ) - # Initialize the kernels (watch out for reduce_max) - self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:]) - + self.encoder = SAREncoder(rnn_units, dropout_prob) self.decoder = SARDecoder( - rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders, - input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape] + rnn_units, + self.max_length, + len(vocab), + embedding_units, + attention_units, + num_decoder_cells, + dropout_prob, ) self.postprocessor = SARPostProcessor(vocab=vocab) + @staticmethod def compute_loss( - self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: tf.Tensor, @@ -506,11 +567,13 @@


Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -525,7 +588,7 @@


         mask_values = tf.zeros_like(cce)
         mask_2d = tf.sequence_mask(seq_len, input_len)
         masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
+        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))
         return tf.expand_dims(ce_loss, axis=1)

     def call(
@@ -536,16 +599,28 @@
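A self-contained toy version of the masked loss above (2 words, 4 timesteps, 5 classes), to make the masking after <eos> concrete:

import tensorflow as tf

logits = tf.random.uniform([2, 4, 5])                  # (N, T, vocab_size + 1) model output
gt = tf.constant([[1, 2, 0, 0], [3, 4, 2, 0]])         # encoded labels
seq_len = tf.constant([2, 3]) + 1                       # word lengths + 1 for <eos>

cce = tf.nn.softmax_cross_entropy_with_logits(tf.one_hot(gt, depth=5), logits)   # (N, T)
mask_2d = tf.sequence_mask(seq_len, 4)                                            # True up to <eos>
masked = tf.where(mask_2d, cce, tf.zeros_like(cce))
loss = tf.reduce_sum(masked, axis=1) / tf.cast(seq_len, tf.float32)               # per-word mean CE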


return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - features = self.feat_extractor(x, **kwargs) - pooled_features = tf.reduce_max(features, axis=1) # vertical max pooling + # vertical max pooling --> (N, C, W) + pooled_features = tf.reduce_max(features, axis=1) + # holistic (N, C) encoded = self.encoder(pooled_features, **kwargs) + if target is not None: - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) seq_len = tf.cast(seq_len, tf.int32) - decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training for teacher forcing") + + decoded_features = _bf16_to_float32( + self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + ) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = decoded_features + return out + if return_model_output: out["out_map"] = decoded_features @@ -554,7 +629,7 @@


out["preds"] = self.postprocessor(decoded_features) if target is not None: - out['loss'] = self.compute_loss(decoded_features, gt, seq_len) + out["loss"] = self.compute_loss(decoded_features, gt, seq_len) return out @@ -563,9 +638,8 @@
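As a usage note on `call` above, the keys of the returned dictionary depend on the arguments; a hedged sketch (assumes the `sar_resnet31` builder defined later in this file, with randomly initialized weights):

import tensorflow as tf
from doctr.models import sar_resnet31

model = sar_resnet31(pretrained=False)
x = tf.random.uniform([1, 32, 128, 3])

out = model(x, ["hello"], return_model_output=True, return_preds=True)
# out["out_map"]: raw logits of shape (N, max_length + 1, vocab_size + 1)
# out["preds"]:   list of (word, confidence) pairs from the post-processor
# out["loss"]:    masked cross-entropy against the provided target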


"""Post processor for SAR architectures Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -580,95 +654,75 @@


probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: +def _sar( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> SAR: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units']) - _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units']) - _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length']) - _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) # Feature extractor - feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + pretrained=pretrained_backbone, + input_shape=_cfg["input_shape"], include_top=False, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - kwargs['embedding_units'] = _cfg['embedding_units'] - kwargs['attention_units'] = _cfg['attention_units'] - kwargs['max_length'] = _cfg['max_length'] - kwargs['num_decoders'] = _cfg['num_decoders'] + kwargs["vocab"] = _cfg["vocab"] # Build the model model = SAR(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model -
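To make the post-processing above concrete: logits are decoded greedily (argmax per timestep), the per-character probabilities are reduced with a min to obtain a word-level confidence, and the character string is cut at the first <eos>. A rough standalone sketch with a toy vocab (an illustration, not the exact class):

import tensorflow as tf

vocab = "abc"
embedding = tf.constant(list(vocab) + ["<eos>"], dtype=tf.string)
logits = tf.random.uniform([2, 4, len(vocab) + 1])                  # (N, T, vocab_size + 1)

out_idxs = tf.math.argmax(logits, axis=2)
probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
probs = tf.math.reduce_min(probs, axis=1)                           # min char confidence per word
raw = tf.strings.reduce_join(tf.nn.embedding_lookup(embedding, out_idxs), axis=-1)
words = tf.sparse.to_dense(tf.strings.split(raw, "<eos>").to_sparse(), default_value="not valid")[:, 0]
preds = list(zip([w.decode() for w in words.numpy().tolist()], probs.numpy().clip(0, 1).tolist()))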
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - -
-[docs] +[docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import sar_resnet31 + >>> model = sar_resnet31(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the SAR architecture Returns: + ------- text recognition architecture """ - - return _sar('sar_resnet31', pretrained, **kwargs)
+ return _sar("sar_resnet31", pretrained, resnet31, **kwargs)
@@ -702,8 +756,8 @@


- +
+ diff --git a/v0.5.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.5.1/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.5.1/_modules/doctr/models/recognition/zoo.html b/v0.5.1/_modules/doctr/models/recognition/zoo.html index bf0ae6af6e..f664304019 100644 --- a/v0.5.1/_modules/doctr/models/recognition/zoo.html +++ b/v0.5.1/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
-from doctr.file_utils import is_tf_available, is_torch_available
-from .core import RecognitionPredictor
-from ..preprocessor import PreProcessor
-from .. import recognition
+from doctr.file_utils import is_tf_available
+from doctr.models.preprocessor import PreProcessor
 
+from .. import recognition
+from .predictor import RecognitionPredictor
 
 __all__ = ["recognition_predictor"]
 
 
-if is_tf_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
-elif is_torch_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
+ARCHS: List[str] = [
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "sar_resnet31",
+    "master",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
+]
+
 
+def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+        _model = recognition.__dict__[arch](
+            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
+        )
+    else:
+        if not isinstance(
+            arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+        ):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+    kwargs.pop("pretrained_backbone", None)
 
-    _model = recognition.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 32)
-    predictor = RecognitionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
-        _model
-    )
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
+    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
+    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
 
     return predictor
 
 
 
-[docs] -def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor: +[docs] +def recognition_predictor( + arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + symmetric_pad: bool = False, + batch_size: int = 128, + **kwargs: Any, +) -> RecognitionPredictor: """Text recognition architecture. Example:: @@ -326,14 +369,18 @@


        >>> out = model([input_page])
 
     Args:
-        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
+    ----
+        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
+        **kwargs: optional parameters to be passed to the architecture
 
     Returns:
+    -------
         Recognition predictor
     """
-
-    return _predictor(arch, pretrained, **kwargs)
+ return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
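A short usage sketch reflecting the dispatch above: `arch` may be either a registered architecture name or an already instantiated recognition model (assumes pretrained weights can be downloaded):

from doctr.models import recognition_predictor, sar_resnet31

# by name, resolved through the ARCHS registry
predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True)

# or by passing a model instance directly
model = sar_resnet31(pretrained=True)
predictor = recognition_predictor(model, batch_size=64)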
@@ -367,8 +414,8 @@


   
-
- +
+ diff --git a/v0.5.1/_modules/doctr/models/zoo.html b/v0.5.1/_modules/doctr/models/zoo.html index dec6857019..d459671648 100644 --- a/v0.5.1/_modules/doctr/models/zoo.html +++ b/v0.5.1/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from typing import Any
-from .core import OCRPredictor
+
 from .detection.zoo import detection_predictor
+from .kie_predictor import KIEPredictor
+from .predictor import OCRPredictor
 from .recognition.zoo import recognition_predictor
 
+__all__ = ["ocr_predictor", "kie_predictor"]
 
-__all__ = ["ocr_predictor"]
-
-
-def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
 
+def _predictor(
+    det_arch: Any,
+    reco_arch: Any,
+    pretrained: bool,
+    pretrained_backbone: bool = True,
+    assume_straight_pages: bool = True,
+    preserve_aspect_ratio: bool = True,
+    symmetric_pad: bool = True,
+    det_bs: int = 2,
+    reco_bs: int = 128,
+    detect_orientation: bool = False,
+    straighten_pages: bool = False,
+    detect_language: bool = False,
+    **kwargs,
+) -> OCRPredictor:
     # Detection
-    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
+    det_predictor = detection_predictor(
+        det_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=det_bs,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+    )
 
     # Recognition
-    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
+    reco_predictor = recognition_predictor(
+        reco_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=reco_bs,
+    )
 
-    return OCRPredictor(det_predictor, reco_predictor)
+    return OCRPredictor(
+        det_predictor,
+        reco_predictor,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+        detect_orientation=detect_orientation,
+        straighten_pages=straighten_pages,
+        detect_language=detect_language,
+        **kwargs,
+    )
 
 
 
-[docs] +[docs] def ocr_predictor( - det_arch: str = 'db_resnet50', - reco_arch: str = 'crnn_vgg16_bn', + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", pretrained: bool = False, - **kwargs: Any + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - Example:: - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` Returns: + ------- OCR predictor """ + return _predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
+ + - return _predictor(det_arch, reco_arch, pretrained, **kwargs)
+def _kie_predictor( + det_arch: Any, + reco_arch: Any, + pretrained: bool, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + det_bs: int = 2, + reco_bs: int = 128, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs, +) -> KIEPredictor: + # Detection + det_predictor = detection_predictor( + det_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=det_bs, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + ) + + # Recognition + reco_predictor = recognition_predictor( + reco_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=reco_bs, + ) + + return KIEPredictor( + det_predictor, + reco_predictor, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + ) + + +
+[docs] +def kie_predictor( + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, +) -> KIEPredictor: + """End-to-end KIE architecture using one model for localization, and another for text recognition. + + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` + + Returns: + ------- + KIE predictor + """ + return _kie_predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
@@ -353,8 +575,8 @@
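Usage of the two entry points defined above, with their new defaults (fast_base detection + crnn_vgg16_bn recognition); a sketch assuming pretrained weights are reachable:

import numpy as np
from doctr.models import ocr_predictor, kie_predictor

page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)

ocr = ocr_predictor(pretrained=True)    # fast_base + crnn_vgg16_bn by default
kie = kie_predictor(pretrained=True)    # same building blocks, returns a KIEPredictor

ocr_out = ocr([page])
kie_out = kie([page])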


       
     
   
- - + + diff --git a/v0.5.1/_modules/doctr/transforms/modules.html b/v0.5.1/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.5.1/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - doctr.transforms.modules - docTR documentation

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
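The non-obvious part of Resize above is the padding offset when preserve_aspect_ratio and symmetric_pad are enabled; a small standalone sketch of that arithmetic (values are arbitrary):

import tensorflow as tf

output_size = (32, 128)
img = tf.random.uniform([32, 64, 3])                     # image after aspect-preserving resize, too narrow
offset = (0, int((output_size[1] - img.shape[1]) / 2))   # pad equally on the left and right
padded = tf.image.pad_to_bounding_box(img, *offset, *output_size)   # (32, 128, 3)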
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/_modules/doctr/transforms/modules/base.html b/v0.5.1/_modules/doctr/transforms/modules/base.html index c42079a8fd..4596df3848 100644 --- a/v0.5.1/_modules/doctr/transforms/modules/base.html +++ b/v0.5.1/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.base

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+import math
 import random
-from typing import List, Any, Callable
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import numpy as np
 
 from doctr.utils.repr import NestedObject
+
 from .. import functional as F
 
+__all__ = ["SampleCompose", "ImageTransform", "ColorInversion", "OneOf", "RandomApply", "RandomRotate", "RandomCrop"]
+
+
+class SampleCompose(NestedObject):
+    """Implements a wrapper that will apply transformations sequentially on both image and target
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfo = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(30)])
+                >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4)))
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import torch
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfos = SampleCompose([ImageTransform(ColorInversion(min_val=0.6)), RandomRotate(30)])
+                >>> out, out_boxes = transfos(torch.rand(8, 64, 64, 3), np.zeros((2, 4)))
+
+    Args:
+    ----
+        transforms: list of transformation modules
+    """
+
+    _children_names: List[str] = ["sample_transforms"]
+
+    def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None:
+        self.sample_transforms = transforms
+
+    def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]:
+        for t in self.sample_transforms:
+            x, target = t(x, target)
+
+        return x, target
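Editor's note: since SampleCompose chains any callables with an (image, target) -> (image, target) signature, a plain function can sit alongside wrapped transforms. A minimal sketch, assuming the TensorFlow backend; the drop_empty_boxes helper is hypothetical and only illustrates the expected signature.

import numpy as np
import tensorflow as tf

from doctr.transforms import ColorInversion, ImageTransform, SampleCompose


def drop_empty_boxes(img, target):
    # Hypothetical sample-level step: keep only boxes with a strictly positive area
    keep = (target[:, 2] > target[:, 0]) & (target[:, 3] > target[:, 1])
    return img, target[keep]


transfo = SampleCompose([
    ImageTransform(ColorInversion(min_val=0.6)),  # image-only transform lifted to (img, target)
    drop_empty_boxes,                             # custom callable with the same signature
])
img, boxes = transfo(
    tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1),
    np.array([[0.1, 0.1, 0.4, 0.5], [0.2, 0.2, 0.2, 0.6]], dtype=np.float32),
)
# The second box has zero width, so only the first one survives the custom step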
+
+
+class ImageTransform(NestedObject):
+    """Implements a transform wrapper to turn an image-only transformation into an image+target transform
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion(min_val=0.6))
+                >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None)
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import torch
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion(min_val=0.6))
+                >>> out, _ = transfo(torch.rand(8, 64, 64, 3), None)
+
+    Args:
+    ----
+        transform: the image transformation module to wrap
+    """
+
+    _children_names: List[str] = ["img_transform"]
+
+    def __init__(self, transform: Callable[[Any], Any]) -> None:
+        self.img_transform = transform
 
-__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
+    def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]:
+        img = self.img_transform(img)
+        return img, target
 
 
 
-[docs] +[docs] class ColorInversion(NestedObject): """Applies the following transformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(torch.rand(8, 64, 64, 3)) Args: + ---- min_val: range [min_val, 1] to colorize RGB pixels """ + def __init__(self, min_val: float = 0.5) -> None: self.min_val = min_val @@ -316,59 +437,178 @@

Source code for doctr.transforms.modules.base

-[docs] +[docs] class OneOf(NestedObject): """Randomly apply one of the input transformations - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transforms: list of transformations, one only will be picked """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: # Pick transformation transfo = self.transforms[int(random.random() * len(self.transforms))] # Apply - return transfo(img)
+ return transfo(img) if target is None else transfo(img, target) # type: ignore[call-arg]
-[docs] +[docs] class RandomApply(NestedObject): """Apply with a probability p the input transformation - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transform: transformation to apply p: probability to apply """ - def __init__(self, transform: Callable[[Any], Any], p: float = .5) -> None: + + def __init__(self, transform: Callable[[Any], Any], p: float = 0.5) -> None: self.transform = transform self.p = p def extra_repr(self) -> str: return f"transform={self.transform}, p={self.p}" - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: if random.random() < self.p: - return self.transform(img) - return img
+ return self.transform(img) if target is None else self.transform(img, target) # type: ignore[call-arg] + return img if target is None else (img, target)
+ + + +
+[docs] +class RandomRotate(NestedObject): + """Randomly rotate a tensor image and its boxes + + .. image:: https://doctr-static.mindee.com/models?id=v0.4.0/rotation_illustration.png&src=0 + :align: center + + Args: + ---- + max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in + [-max_angle, max_angle] + expand: whether the image should be padded before the rotation + """ + + def __init__(self, max_angle: float = 5.0, expand: bool = False) -> None: + self.max_angle = max_angle + self.expand = expand + + def extra_repr(self) -> str: + return f"max_angle={self.max_angle}, expand={self.expand}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + angle = random.uniform(-self.max_angle, self.max_angle) + r_img, r_polys = F.rotate_sample(img, target, angle, self.expand) + # Removes deleted boxes + is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2 + return r_img, r_polys[is_kept]
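Editor's note on the is_kept filter above: a rotated box is dropped when it no longer has a positive extent along both axes (for example when it was rotated entirely out of the canvas and clipped down to a point). A standalone NumPy illustration of that test, independent of the library:

import numpy as np

# Two 4-point polygons in relative coords: the first is valid, the second has
# collapsed to a single point (zero extent on both axes)
r_polys = np.array([
    [[0.1, 0.2], [0.4, 0.2], [0.4, 0.6], [0.1, 0.6]],
    [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]],
])

# A polygon is kept only if max > min along both x and y
is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2
print(is_kept)  # [ True False]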
+ + + +
+[docs] +class RandomCrop(NestedObject): + """Randomly crop a tensor image and its boxes + + Args: + ---- + scale: tuple of floats, relative (min_area, max_area) of the crop + ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w + """ + + def __init__(self, scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: + self.scale = scale + self.ratio = ratio + + def extra_repr(self) -> str: + return f"scale={self.scale}, ratio={self.ratio}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + scale = random.uniform(self.scale[0], self.scale[1]) + ratio = random.uniform(self.ratio[0], self.ratio[1]) + + height, width = img.shape[:2] + + # Calculate crop size + crop_area = scale * width * height + aspect_ratio = ratio * (width / height) + crop_width = int(round(math.sqrt(crop_area * aspect_ratio))) + crop_height = int(round(math.sqrt(crop_area / aspect_ratio))) + + # Ensure crop size does not exceed image dimensions + crop_width = min(crop_width, width) + crop_height = min(crop_height, height) + + # Randomly select crop position + x = random.randint(0, width - crop_width) + y = random.randint(0, height - crop_height) + + # relative crop box + crop_box = (x / width, y / height, (x + crop_width) / width, (y + crop_height) / height) + if target.shape[1:] == (4, 2): + min_xy = np.min(target, axis=1) + max_xy = np.max(target, axis=1) + _target = np.concatenate((min_xy, max_xy), axis=1) + else: + _target = target + + # Crop image and targets + croped_img, crop_boxes = F.crop_detection(img, _target, crop_box) + # hard fallback if no box is kept + if crop_boxes.shape[0] == 0: + return img, target + # clip boxes + return croped_img, np.clip(crop_boxes, 0, 1)
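Editor's note: to make the crop-size arithmetic above concrete, here is the same computation with fixed numbers in place of the random draws (scale is a fraction of the image area, ratio additionally skews the crop's width/height balance relative to the image). A sketch only, not library code:

import math

height, width = 480, 640     # example image size (H, W)
scale, ratio = 0.5, 1.0      # drawn uniformly from self.scale and self.ratio at runtime

crop_area = scale * width * height               # target crop area: 153600 px
aspect_ratio = ratio * (width / height)          # 1.333...
crop_width = int(round(math.sqrt(crop_area * aspect_ratio)))   # 453
crop_height = int(round(math.sqrt(crop_area / aspect_ratio)))  # 339

# The crop is then clamped to the image, placed at a random (x, y) offset,
# and expressed as a relative box before being handed to crop_detection.
print(crop_width, crop_height)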
@@ -402,8 +642,8 @@

Source code for doctr.transforms.modules.base

- - + + diff --git a/v0.5.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.5.1/_modules/doctr/transforms/modules/tensorflow.html index 1d192a876b..acbbe96225 100644 --- a/v0.5.1/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.5.1/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
 import tensorflow as tf
-from typing import List, Any, Tuple, Callable
 
 from doctr.utils.repr import NestedObject
 
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
-           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
+from ..functional.tensorflow import _gaussian_filter, random_shadow
+
+__all__ = [
+    "Compose",
+    "Resize",
+    "Normalize",
+    "LambdaTransformation",
+    "ToGray",
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomSaturation",
+    "RandomHue",
+    "RandomGamma",
+    "RandomJpegQuality",
+    "GaussianBlur",
+    "ChannelShuffle",
+    "GaussianNoise",
+    "RandomHorizontalFlip",
+    "RandomShadow",
+    "RandomResize",
+]
 
 
 
-[docs] +[docs] class Compose(NestedObject): """Implements a wrapper that will apply transformations sequentially - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Compose, Resize + >>> transfos = Compose([Resize((32, 32))]) + >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- transforms: list of transformation modules """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms @@ -319,26 +361,27 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class Resize(NestedObject): """Resizes a tensor to a target size - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Resize + >>> transfo = Resize((32, 32)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- output_size: expected output size method: interpolation method preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically """ + def __init__( self, - output_size: Tuple[int, int], - method: str = 'bilinear', + output_size: Union[int, Tuple[int, int]], + method: str = "bilinear", preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, ) -> None: @@ -346,6 +389,14 @@

Source code for doctr.transforms.modules.tensorflow

self.method = method self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad + self.antialias = True + + if isinstance(self.output_size, int): + self.wanted_size = (self.output_size, self.output_size) + elif isinstance(self.output_size, (tuple, list)): + self.wanted_size = self.output_size + else: + raise AssertionError("Output size should be either a list, a tuple or an int") def extra_repr(self) -> str: _repr = f"output_size={self.output_size}, method='{self.method}'" @@ -353,64 +404,106 @@

Source code for doctr.transforms.modules.tensorflow

_repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" return _repr - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) + def __call__( + self, + img: tf.Tensor, + target: Optional[np.ndarray] = None, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: + input_dtype = img.dtype + self.output_size = ( + (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size + ) + + img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias) + # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio + raw_shape = img.shape[:2] + if self.symmetric_pad: + half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0) if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
+ if isinstance(self.output_size, (tuple, list)): + # In that case we need to pad because we want to enforce both width and height + if not self.symmetric_pad: + half_pad = (0, 0) + elif self.output_size[0] == img.shape[0]: + half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2)) + # Pad image + img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size) + + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: + if self.symmetric_pad: + offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1] + + if self.preserve_aspect_ratio: + # Get absolute coords + if target.shape[1:] == (4,): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1] + target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0] + else: + target[:, [0, 2]] *= raw_shape[1] / img.shape[1] + target[:, [1, 3]] *= raw_shape[0] / img.shape[0] + elif target.shape[1:] == (4, 2): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1] + target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0] + else: + target[..., 0] *= raw_shape[1] / img.shape[1] + target[..., 1] *= raw_shape[0] / img.shape[0] + else: + raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") + + return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1) + + return tf.cast(img, dtype=input_dtype)
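Editor's note: a hedged illustration of the target handling above. With a 32x64 input letterboxed into a 64x64 output (preserve_aspect_ratio plus symmetric_pad), the resized content fills the full width but only the middle half of the height, so relative y-coordinates are scaled by 0.5 and offset by 0.25 while x-coordinates are left untouched. Sketch assuming the TensorFlow backend:

import numpy as np
import tensorflow as tf
from doctr.transforms import Resize

transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=True)
img = tf.random.uniform(shape=[32, 64, 3], minval=0, maxval=1)
boxes = np.array([[0.0, 0.0, 1.0, 1.0]], dtype=np.float32)  # full-image box, (xmin, ymin, xmax, ymax)

out_img, out_boxes = transfo(img, boxes)
print(out_img.shape)   # (64, 64, 3)
print(out_boxes)       # approx. [[0. 0.25 1. 0.75]]: y squeezed into the un-padded band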
-[docs] +[docs] class Normalize(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Normalize + >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- mean: average value per channel std: standard deviation per channel """ + def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) + self.mean = tf.constant(mean) + self.std = tf.constant(std) def extra_repr(self) -> str: return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std + img -= tf.cast(self.mean, dtype=img.dtype) + img /= tf.cast(self.std, dtype=img.dtype) return img
-[docs] +[docs] class LambdaTransformation(NestedObject): """Apply a user-defined function to the input tensor - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import LambdaTransformation + >>> transfo = LambdaTransformation(lambda x: x/ 255.) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- fn: the function to be applied to the input tensor """ + def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: self.fn = fn @@ -420,37 +513,42 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class ToGray(NestedObject): """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import ToGray + >>> transfo = ToGray() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) """ + + def __init__(self, num_output_channels: int = 1): + self.num_output_channels = num_output_channels + def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
+ img = tf.image.rgb_to_grayscale(img) + return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
-[docs] +[docs] class RandomBrightness(NestedObject): """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomBrightness + >>> transfo = RandomBrightness() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -463,21 +561,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomContrast(NestedObject): """Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomContrast + >>> transfo = RandomContrast() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) """ - def __init__(self, delta: float = .3) -> None: + + def __init__(self, delta: float = 0.3) -> None: self.delta = delta def extra_repr(self) -> str: @@ -489,21 +588,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomSaturation(NestedObject): """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomSaturation + >>> transfo = RandomSaturation() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) """ - def __init__(self, delta: float = .5) -> None: + + def __init__(self, delta: float = 0.5) -> None: self.delta = delta def extra_repr(self) -> str: @@ -515,19 +615,20 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomHue(NestedObject): """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHue + >>> transfo = RandomHue() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -540,22 +641,23 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomGamma(NestedObject): """randomly performs gamma correction for a tensor (batch of images or image) - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomGamma + >>> transfo = RandomGamma() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- min_gamma: non-negative real number, lower bound for gamma param max_gamma: non-negative real number, upper bound for gamma min_gain: lower bound for constant multiplier max_gain: upper bound for constant multiplier """ + def __init__( self, min_gamma: float = 0.5, @@ -580,20 +682,21 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomJpegQuality(NestedObject): """Randomly adjust jpeg quality of a 3 dimensional RGB image - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomJpegQuality + >>> transfo = RandomJpegQuality() + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- min_quality: int between [0, 100] max_quality: int between [0, 100] """ + def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: self.min_quality = min_quality self.max_quality = max_quality @@ -602,10 +705,224 @@

Source code for doctr.transforms.modules.tensorflow

return f"min_quality={self.min_quality}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality + return tf.image.random_jpeg_quality(img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality)
+ + + +
+[docs] +class GaussianBlur(NestedObject): + """Applies a Gaussian blur to the input image, with a standard deviation randomly picked in the given range + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianBlur + >>> transfo = GaussianBlur(3, (.1, 5)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + kernel_shape: size of the blurring kernel + std: min and max value of the standard deviation + """ + + def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None: + self.kernel_shape = kernel_shape + self.std = std + + def extra_repr(self) -> str: + return f"kernel_shape={self.kernel_shape}, std={self.std}" + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.squeeze( + _gaussian_filter( + img[tf.newaxis, ...], + kernel_size=self.kernel_shape, + sigma=random.uniform(self.std[0], self.std[1]), + mode="REFLECT", + ), + axis=0, )
+ + +
+[docs] +class ChannelShuffle(NestedObject): + """Randomly shuffle channel order of a given image""" + + def __init__(self): + pass + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
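Editor's note on the double transpose above: tf.random.shuffle only permutes the first axis, so the image is moved to channels-first, the channel axis is shuffled, and the original layout is restored. The same trick in plain NumPy with an explicit permutation, just to make it visible (not library code):

import numpy as np

img = np.arange(2 * 2 * 3).reshape(2, 2, 3)   # HWC image
perm = np.random.permutation(3)               # random channel order

# channels-first, reorder the leading (channel) axis, then back to HWC
shuffled = np.transpose(np.transpose(img, (2, 0, 1))[perm], (1, 2, 0))
assert np.array_equal(shuffled, img[..., perm])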
+ + + +
+[docs] +class GaussianNoise(NestedObject): + """Adds Gaussian Noise to the input tensor + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianNoise + >>> transfo = GaussianNoise(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + mean : mean of the gaussian distribution + std : std of the gaussian distribution + """ + + def __init__(self, mean: float = 0.0, std: float = 1.0) -> None: + super().__init__() + self.std = std + self.mean = mean + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), dtype=tf.uint8 + ) + else: + return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype) + + def extra_repr(self) -> str: + return f"mean={self.mean}, std={self.std}"
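Editor's note: in the code above the offset is drawn with tf.random.uniform over [mean - std, mean + std] and expressed on the [0, 1] scale, so uint8 inputs are handled by scaling the noise by 255 before rounding and clipping. A short usage sketch, assuming the TensorFlow backend is active:

import tensorflow as tf
from doctr.transforms import GaussianNoise

transfo = GaussianNoise(mean=0.0, std=0.1)

float_img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
uint8_img = tf.cast(255 * float_img, dtype=tf.uint8)

out_f = transfo(float_img)   # noise added directly, result clipped to [0, 1]
out_u = transfo(uint8_img)   # noise scaled by 255, result clipped to [0, 255]
print(out_f.dtype, out_u.dtype)  # each output keeps its input dtype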
+ + + +
+[docs] +class RandomHorizontalFlip(NestedObject): + """Adds random horizontal flip to the input tensor/np.ndarray + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHorizontalFlip + >>> transfo = RandomHorizontalFlip(p=0.5) + >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1) + >>> target = np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32) + >>> out = transfo(image, target) + + Args: + ---- + p : probability of Horizontal Flip + """ + + def __init__(self, p: float) -> None: + super().__init__() + self.p = p + + def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + _img = tf.image.flip_left_right(img) + _target = target.copy() + # Changing the relative bbox coordinates + if target.shape[1:] == (4,): + _target[:, ::2] = 1 - target[:, [2, 0]] + else: + _target[..., 0] = 1 - target[..., 0] + return _img, _target + return img, target
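Editor's note: the box update above mirrors relative x-coordinates around the vertical axis, swapping xmin and xmax so the box stays well-ordered (xmin' = 1 - xmax, xmax' = 1 - xmin). A tiny NumPy check of that indexing, independent of the library:

import numpy as np

target = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)  # (xmin, ymin, xmax, ymax)

flipped = target.copy()
flipped[:, ::2] = 1 - target[:, [2, 0]]   # columns 0 and 2 get (1 - xmax, 1 - xmin)
print(flipped)                            # [[0.6 0.1 0.9 0.5]]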
+ + + +
+[docs] +class RandomShadow(NestedObject): + """Adds random shade to the input image + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomShadow + >>> transfo = RandomShadow(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + opacity_range : minimum and maximum opacity of the shade + """ + + def __init__(self, opacity_range: Optional[Tuple[float, float]] = None) -> None: + super().__init__() + self.opacity_range = opacity_range if isinstance(opacity_range, tuple) else (0.2, 0.8) + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value( + tf.math.round(255 * random_shadow(tf.cast(x, dtype=tf.float32) / 255, self.opacity_range)), + 0, + 255, + ), + dtype=tf.uint8, + ) + else: + return tf.clip_by_value(random_shadow(x, self.opacity_range), 0, 1) + + def extra_repr(self) -> str: + return f"opacity_range={self.opacity_range}"
+ + + +
+[docs] +class RandomResize(NestedObject): + """Randomly resize the input image and align corresponding targets + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomResize + >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + scale_range: range of the resizing factor for width and height (independently) + preserve_aspect_ratio: whether to preserve the aspect ratio of the image, + given a float value, the aspect ratio will be preserved with this probability + symmetric_pad: whether to symmetrically pad the image, + given a float value, the symmetric padding will be applied with this probability + p: probability to apply the transformation + """ + + def __init__( + self, + scale_range: Tuple[float, float] = (0.3, 0.9), + preserve_aspect_ratio: Union[bool, float] = False, + symmetric_pad: Union[bool, float] = False, + p: float = 0.5, + ): + super().__init__() + self.scale_range = scale_range + self.preserve_aspect_ratio = preserve_aspect_ratio + self.symmetric_pad = symmetric_pad + self.p = p + self._resize = Resize + + def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + scale_h = random.uniform(*self.scale_range) + scale_w = random.uniform(*self.scale_range) + new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w)) + + _img, _target = self._resize( + new_size, + preserve_aspect_ratio=self.preserve_aspect_ratio + if isinstance(self.preserve_aspect_ratio, bool) + else bool(np.random.rand(1) <= self.preserve_aspect_ratio), + symmetric_pad=self.symmetric_pad + if isinstance(self.symmetric_pad, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + )(img, target) + + return _img, _target + return img, target + + def extra_repr(self) -> str: + return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}" # noqa: E501
+
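Editor's note: since RandomResize.__call__ expects both an image and a target array, a usage sketch with a dummy box (assuming the TensorFlow backend) would look like this, rather than the image-only call shown in the docstring:

import numpy as np
import tensorflow as tf
from doctr.transforms import RandomResize

transfo = RandomResize(scale_range=(0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=1.0)
img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
boxes = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)

# p=1.0 so the transform always fires; boxes may be rescaled in place by the inner Resize
out_img, out_boxes = transfo(img, boxes)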
@@ -638,8 +955,8 @@

Source code for doctr.transforms.modules.tensorflow

- +
+ diff --git a/v0.5.1/_modules/doctr/utils/metrics.html b/v0.5.1/_modules/doctr/utils/metrics.html index 460c64a385..8a37d5949a 100644 --- a/v0.5.1/_modules/doctr/utils/metrics.html +++ b/v0.5.1/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.metrics

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-import cv2
-from typing import List, Tuple, Dict, Optional
-from unidecode import unidecode
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
-from doctr.utils.geometry import rbbox_to_polygon
+from shapely.geometry import Polygon
 
-__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
-           'nms', 'LocalizationConfusion', 'OCRMetric']
+__all__ = [
+    "TextMatch",
+    "box_iou",
+    "polygon_iou",
+    "nms",
+    "LocalizationConfusion",
+    "OCRMetric",
+    "DetectionMetric",
+]
 
 
 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
-    """Perform string comparison with multiple levels of tolerance
+    """Performs string comparison with multiple levels of tolerance
 
     Args:
+    ----
         word1: a string
         word2: another string
 
     Returns:
+    -------
         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-            unidecode counterparts and their lower-case unidecode counterparts match
+            anyascii counterparts and their lower-case anyascii counterparts match
     """
-    raw_match = (word1 == word2)
-    caseless_match = (word1.lower() == word2.lower())
-    unidecode_match = (unidecode(word1) == unidecode(word2))
+    raw_match = word1 == word2
+    caseless_match = word1.lower() == word2.lower()
+    anyascii_match = anyascii(word1) == anyascii(word2)
 
     # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
-    unicase_match = (unidecode(word1).lower() == unidecode(word2).lower())
+    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-    return raw_match, caseless_match, unidecode_match, unicase_match
+    return raw_match, caseless_match, anyascii_match, unicase_match
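Editor's note: a small hedged illustration of the four tolerance levels, and of why the transliteration is applied before lower-casing for pairs such as ("EUR", "€"). The exact output for "€" depends on the anyascii version, but it is expected to be "EUR":

from doctr.utils.metrics import string_match

print(string_match("Hello", "Hello"))  # (True, True, True, True)
print(string_match("Hello", "hello"))  # (False, True, False, True): only the caseless checks pass
print(string_match("EUR", "€"))        # expected (False, False, True, True) if anyascii("€") == "EUR"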
 
 
 
-[docs] +[docs] class TextMatch: - """Implements text match metric (word-level accuracy) for recognition task. + r"""Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \\forall X, Y \\in \\mathcal{W}^N, - TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) + \forall X, Y \in \mathcal{W}^N, + TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \\forall a, x \\in \\mathcal{W}, - f_a(x) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } x = a \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{W}` is the set of all possible character sequences, + \forall a, x \in \mathcal{W}, + f_a(x) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } x = a \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. - Example:: - >>> from doctr.utils import TextMatch - >>> metric = TextMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() """ def __init__(self) -> None: self.reset() +
+[docs] def update( self, gt: List[str], @@ -354,29 +386,32 @@

Source code for doctr.utils.metrics

         """Update the state of the metric with new predictions
 
         Args:
+        ----
            gt: list of ground-truth character sequences
-            pred: list of predicted character sequences"""
-
+            pred: list of predicted character sequences
+        """
         if len(gt) != len(pred):
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.unidecode += int(_unidecode)
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
-        self.total += len(gt)
+        self.total += len(gt)
+
-[docs] +[docs] def summary(self) -> Dict[str, float]: """Computes the aggregated metrics - Returns: - a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode - counterpart and its lower-case unidecode counterpart + Returns + ------- + a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii + counterpart and its lower-case anyascii counterpart """ if self.total == 0: raise AssertionError("you need to update the metric before getting the summary") @@ -384,7 +419,7 @@

Source code for doctr.utils.metrics

         return dict(
             raw=self.raw / self.total,
             caseless=self.caseless / self.total,
-            unidecode=self.unidecode / self.total,
+            anyascii=self.anyascii / self.total,
             unicase=self.unicase / self.total,
         )
@@ -392,23 +427,25 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.raw = 0
         self.caseless = 0
-        self.unidecode = 0
+        self.anyascii = 0
         self.unicase = 0
         self.total = 0
def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Compute the IoU between two sets of bounding boxes + """Computes the IoU between two sets of bounding boxes Args: + ---- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) + Returns: + ------- the IoU matrix of shape (N, M) """ - - iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) + iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) @@ -419,107 +456,54 @@

Source code for doctr.utils.metrics

         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
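Editor's note: a short usage sketch for the broadcasting above, reusing the boxes from the LocalizationConfusion example further down (values are in pixels, but the function only cares about consistent units):

import numpy as np
from doctr.utils.metrics import box_iou

gts = np.asarray([[0, 0, 100, 100]])
preds = np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])

iou = box_iou(gts, preds)
print(iou.shape)  # (1, 2)
print(iou)        # approx. [[0.49 0.  ]]: 4900 px of overlap over a 10000 px union, then no overlap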
 
 
-def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
-    """Compute the IoA (intersection over area) between two sets of bounding boxes:
-    ioa(i, j) = inter(i, j) / area(i)
-
-    Args:
-        boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
-        boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
-    Returns:
-        the IoA matrix of shape (N, M)
-    """
-
-    ioa_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
-
-    if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
-        l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
-        l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
-
-        left = np.maximum(l1, l2.T)
-        top = np.maximum(t1, t2.T)
-        right = np.minimum(r1, r2.T)
-        bot = np.minimum(b1, b2.T)
-
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
-        area = (r1 - l1) * (b1 - t1)
-        ioa_mat = intersection / area
-
-    return ioa_mat
-
-
-def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
-    """Compute the IoU between two sets of boolean masks
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
+    """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
-        masks_1: boolean masks of shape (N, H, W)
-        masks_2: boolean masks of shape (M, H, W)
+    ----
+        polys_1: rotated bounding boxes of shape (N, 4, 2)
+        polys_2: rotated bounding boxes of shape (M, 4, 2)
 
     Returns:
+    -------
         the IoU matrix of shape (N, M)
     """
+    if polys_1.ndim != 3 or polys_2.ndim != 3:
+        raise AssertionError("expects boxes to be in format (N, 4, 2)")
 
-    if masks_1.shape[1:] != masks_2.shape[1:]:
-        raise AssertionError("both boolean masks should have the same spatial shape")
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
 
-    iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
 
-    if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
-        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
-        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
-        axes = tuple(range(2, masks_1.ndim + 1))
-        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
 
     return iou_mat
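Editor's note: the rotated counterpart goes through Shapely polygon intersections; with two axis-aligned unit squares expressed as 4-point polygons the expected value is easy to check by hand (intersection 0.5, union 1.5):

import numpy as np
from doctr.utils.metrics import polygon_iou

polys_1 = np.array([[[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]]])  # unit square
polys_2 = np.array([[[0.5, 0.0], [1.5, 0.0], [1.5, 1.0], [0.5, 1.0]]])  # same square shifted by 0.5

print(polygon_iou(polys_1, polys_2))  # approx. [[0.3333]]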
 
 
-def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Convert boxes to masks
-
-    Args:
-        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
-        shape: spatial shapes of the output masks
-
-    Returns:
-        the boolean masks of shape (N, H, W)
-    """
-
-    masks = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
-
-    if boxes.shape[0] > 0:
-        # Get absolute coordinates
-        if boxes.dtype != np.int:
-            abs_boxes = boxes.copy()
-            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
-            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
-            abs_boxes = abs_boxes.round().astype(np.int)
-        else:
-            abs_boxes = boxes
-            abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
-
-        # TODO: optimize slicing to improve vectorization
-        for idx, _box in enumerate(abs_boxes):
-            box = rbbox_to_polygon(_box)
-            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
-
-    return masks.astype(bool)
-
-
-def nms(boxes: np.ndarray, thresh: float = .5) -> List[int]:
+def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
     """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
 
     Args:
+    ----
         boxes: np array of straight boxes: (*, 5), (xmin, ymin, xmax, ymax, score)
         thresh: iou threshold to perform box suppression.
 
     Returns:
+    -------
         A list of box indexes to keep
     """
     x1 = boxes[:, 0]
@@ -551,66 +535,71 @@ 

Source code for doctr.utils.metrics

 
 
 
-[docs] +[docs] class LocalizationConfusion: - """Implements common confusion metrics and mean IoU for localization evaluation. + r"""Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ - Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) + \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ + Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ + Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ + meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \\forall y \\in \\mathcal{B}, - g_X(y) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, + \forall y \in \mathcal{B}, + g_X(y) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: + """Updates the metric + Args: + ---- + gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + """ if preds.shape[0] > 0: # Compute IoU - if self.rotated_bbox: - mask_gts = rbox_to_mask(gts, shape=self.mask_shape) - mask_preds = rbox_to_mask(preds, shape=self.mask_shape) - iou_mat = mask_iou(mask_gts, mask_preds) + if self.use_polygons: + iou_mat = polygon_iou(gts, preds) else: iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=1).sum()) + self.tot_iou += float(iou_mat.max(axis=0).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -618,17 +607,18 @@

Source code for doctr.utils.metrics

 
         # Update counts
         self.num_gts += gts.shape[0]
-        self.num_preds += preds.shape[0]
+        self.num_preds += preds.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: """Computes the aggregated metrics - Returns: + Returns + ------- a tuple with the recall, precision and meanIoU scores """ - # Recall recall = self.matches / self.num_gts if self.num_gts > 0 else None @@ -636,7 +626,7 @@

Source code for doctr.utils.metrics

         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -645,64 +635,65 @@

Source code for doctr.utils.metrics

         self.num_gts = 0
         self.num_preds = 0
         self.matches = 0
-        self.tot_iou = 0.
+ self.tot_iou = 0.0
-[docs] +[docs] class OCRMetric: - """Implements end-to-end OCR metric. + r"""Implements an end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, - \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ - Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) + \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, + \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ + Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, - h_{B,L}(b, l) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ - & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, - :math:`\\mathcal{L}` is the set of possible character sequences, + \forall (b, l) \in \mathcal{B} \times \mathcal{L}, + h_{B,L}(b, l) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - ['hello'], ['hello', 'world']) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> ['hello'], ['hello', 'world']) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update( self, gt_boxes: np.ndarray, @@ -710,50 +701,58 @@

Source code for doctr.utils.metrics

         gt_labels: List[str],
         pred_labels: List[str],
     ) -> None:
+        """Updates the metric
 
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+            gt_labels: a list of N string labels
+            pred_labels: a list of M string labels
+        """
         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
-            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
-                                 "and the predictions")
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
 
         # Compute IoU
         if pred_boxes.shape[0] > 0:
-            if self.rotated_bbox:
-                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
-                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
-                iou_mat = mask_iou(mask_gts, mask_preds)
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
-            self.tot_iou += float(iou_mat.max(axis=1).sum())
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
 
             # Assign pairs
             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.unidecode_matches += int(_unidecode)
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
 
         self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]: """Computes the aggregated metrics - Returns: - a tuple with the recall & precision for each string comparison flexibility and the mean IoU + Returns + ------- + a tuple with the recall & precision for each string comparison and the mean IoU """ - # Recall recall = dict( raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None, caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None, - unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None, + anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None, unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None, ) @@ -761,12 +760,12 @@

Source code for doctr.utils.metrics

         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-            unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
             unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -774,12 +773,136 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.num_gts = 0
         self.num_preds = 0
-        self.tot_iou = 0.
+        self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.unidecode_matches = 0
+        self.anyascii_matches = 0
         self.unicase_matches = 0
+ + +
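As a quick sanity check on how the aggregation above behaves (illustrative box values and labels; the import path follows the docstring example):

>>> import numpy as np
>>> from doctr.utils import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> # one ground-truth box, one identical prediction with the same transcription
>>> metric.update(np.asarray([[0.1, 0.1, 0.6, 0.6]]), np.asarray([[0.1, 0.1, 0.6, 0.6]]), ["hello"], ["hello"])
>>> recall, precision, mean_iou = metric.summary()
>>> # every string-comparison flavour (raw, caseless, anyascii, unicase) reports 1.0, and mean_iou is 1.0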
+[docs] +class DetectionMetric: + r"""Implements an object detection metric. + + The aggregated metrics are computed as follows: + + .. math:: + \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, + \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ + Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + + with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and + :math:`y`, and the function :math:`h_{B, C}` defined as: + + .. math:: + \forall (b, c) \in \mathcal{B} \times \mathcal{C}, + h_{B,C}(b, c) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{C}` is the set of possible class indices, + :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. + + >>> import numpy as np + >>> from doctr.utils import DetectionMetric + >>> metric = DetectionMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) + >>> metric.summary() + + Args: + ---- + iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format + """ + + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: + self.iou_thresh = iou_thresh + self.use_polygons = use_polygons + self.reset() + +
+[docs] + def update( + self, + gt_boxes: np.ndarray, + pred_boxes: np.ndarray, + gt_labels: np.ndarray, + pred_labels: np.ndarray, + ) -> None: + """Updates the metric + + Args: + ---- + gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + gt_labels: an array of class indices of shape (N,) + pred_labels: an array of class indices of shape (M,) + """ + if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: + raise AssertionError( + "there should be the same number of boxes and string both for the ground truth and the predictions" + ) + + # Compute IoU + if pred_boxes.shape[0] > 0: + if self.use_polygons: + iou_mat = polygon_iou(gt_boxes, pred_boxes) + else: + iou_mat = box_iou(gt_boxes, pred_boxes) + + self.tot_iou += float(iou_mat.max(axis=0).sum()) + + # Assign pairs + gt_indices, pred_indices = linear_sum_assignment(-iou_mat) + is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh + # Category comparison + self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) + + self.num_gts += gt_boxes.shape[0] + self.num_preds += pred_boxes.shape[0]
+ + +
+[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Computes the aggregated metrics + + Returns + ------- + a tuple with the recall & precision for each class prediction and the mean IoU + """ + # Recall + recall = self.num_matches / self.num_gts if self.num_gts > 0 else None + + # Precision + precision = self.num_matches / self.num_preds if self.num_preds > 0 else None + + # mean IoU (overall detected boxes) + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None + + return recall, precision, mean_iou
+ + + def reset(self) -> None: + self.num_gts = 0 + self.num_preds = 0 + self.tot_iou = 0.0 + self.num_matches = 0
+
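To illustrate how the class-agreement term interacts with localization (illustrative values; import path as in the docstring above): a prediction that overlaps its ground truth perfectly but carries the wrong class index still contributes to the mean IoU, yet not to recall or precision.

>>> import numpy as np
>>> from doctr.utils import DetectionMetric
>>> metric = DetectionMetric(iou_thresh=0.5)
>>> metric.update(np.asarray([[0.1, 0.1, 0.6, 0.6]]), np.asarray([[0.1, 0.1, 0.6, 0.6]]),
>>>               np.array([0], dtype=np.int64), np.array([1], dtype=np.int64))
>>> metric.summary()  # recall and precision come out at 0.0 while the mean IoU is 1.0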
@@ -812,8 +935,8 @@

Source code for doctr.utils.metrics

       
     
   
- - + + diff --git a/v0.5.1/_modules/doctr/utils/visualization.html b/v0.5.1/_modules/doctr/utils/visualization.html index 8e7dcca811..c818be6d7b 100644 --- a/v0.5.1/_modules/doctr/utils/visualization.html +++ b/v0.5.1/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.visualization

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import colorsys
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import matplotlib.pyplot as plt
-from matplotlib.figure import Figure
+import cv2
 import matplotlib.patches as patches
-import mplcursors
-from PIL import ImageFont, ImageDraw, Image
+import matplotlib.pyplot as plt
 import numpy as np
-import cv2
-from typing import Tuple, List, Dict, Any, Union
+from matplotlib.figure import Figure
 
-from .common_types import BoundingBox, RotatedBbox
+from .common_types import BoundingBox, Polygon4P
 
-__all__ = ['visualize_page', 'synthetize_page']
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
-def create_rect_patch(
-    geometry: Union[BoundingBox, RotatedBbox],
-    label: str,
+def rect_patch(
+    geometry: BoundingBox,
     page_dimensions: Tuple[int, int],
-    color: Tuple[int, int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
     alpha: float = 0.3,
     linewidth: int = 2,
     fill: bool = True,
-) -> patches.Patch:
-    """Create a matplotlib patch (rectangle) bounding the element
+    preserve_aspect_ratio: bool = False,
+) -> patches.Rectangle:
+    """Create a matplotlib rectangular patch for the element
 
     Args:
+    ----
         geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
         label: label to display when hovered
-        page_dimensions: dimensions of the Page
         color: color to draw box
         alpha: opacity parameter to fill the boxes, 0 = transparent
         linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
 
     Returns:
+    -------
         a rectangular Patch
     """
+    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
     height, width = page_dimensions
-    if len(geometry) == 5:
-        x, y, w, h, a = geometry  # type: ignore[misc]
-        x, w = x * width, w * width
-        y, h = y * height, h * height
-        points = cv2.boxPoints(((x, y), (w, h), a))
-        return patches.Polygon(
-            points,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
-    else:
-        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
-        xmin, xmax = xmin * width, xmax * width
-        ymin, ymax = ymin * height, ymax * height
-        return patches.Rectangle(
-            (xmin, ymin),
-            xmax - xmin,
-            ymax - ymin,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
+    (xmin, ymin), (xmax, ymax) = geometry
+    # Switch to absolute coords
+    if preserve_aspect_ratio:
+        width = height = max(height, width)
+    xmin, w = xmin * width, (xmax - xmin) * width
+    ymin, h = ymin * height, (ymax - ymin) * height
+
+    return patches.Rectangle(
+        (xmin, ymin),
+        w,
+        h,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
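As an illustration of the coordinate handling (made-up geometry and page size; rect_patch is a module-level helper here rather than part of __all__): relative coordinates are scaled by the page width and height, so on a page of dimensions (200, 100) the box ((0.1, 0.2), (0.5, 0.6)) becomes a rectangle anchored near (10, 40), about 40 wide and 80 high in absolute pixels.

>>> from doctr.utils.visualization import rect_patch
>>> patch = rect_patch(((0.1, 0.2), (0.5, 0.6)), (200, 100), label="word")
>>> # patch.get_xy() is roughly (10, 40); patch.get_width() ~ 40, patch.get_height() ~ 80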
+def polygon_patch(
+    geometry: np.ndarray,
+    page_dimensions: Tuple[int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    alpha: float = 0.3,
+    linewidth: int = 2,
+    fill: bool = True,
+    preserve_aspect_ratio: bool = False,
+) -> patches.Polygon:
+    """Create a matplotlib polygon patch for the element
+
+    Args:
+    ----
+        geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
+        label: label to display when hovered
+        color: color to draw box
+        alpha: opacity parameter to fill the boxes, 0 = transparent
+        linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
+
+    Returns:
+    -------
+        a polygon Patch
+    """
+    if not geometry.shape == (4, 2):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
+    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
+    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
+
+    return patches.Polygon(
+        geometry,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def create_obj_patch(
+    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
+    page_dimensions: Tuple[int, int],
+    **kwargs: Any,
+) -> patches.Patch:
+    """Create a matplotlib patch for the element
+
+    Args:
+    ----
+        geometry: bounding box (straight or rotated) of the element
+        page_dimensions: dimensions of the page in format (height, width)
+        **kwargs: keyword arguments for the patch
+
+    Returns:
+    -------
+        a matplotlib Patch
+    """
+    if isinstance(geometry, tuple):
+        if len(geometry) == 2:  # straight word BB (2 pts)
+            return rect_patch(geometry, page_dimensions, **kwargs)
+        elif len(geometry) == 4:  # rotated word BB (4 pts)
+            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
+    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
+        return polygon_patch(geometry, page_dimensions, **kwargs)
+    raise ValueError("invalid geometry format")
+
+
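A short sketch of the dispatch above (illustrative coordinates): a 2-point tuple is routed to rect_patch and yields a Rectangle, while a (4, 2) array of corner points is routed to polygon_patch and yields a Polygon.

>>> import numpy as np
>>> from doctr.utils.visualization import create_obj_patch
>>> straight = create_obj_patch(((0.1, 0.1), (0.4, 0.3)), (100, 200))  # matplotlib Rectangle
>>> rotated = create_obj_patch(np.array([[0.1, 0.1], [0.4, 0.1], [0.4, 0.3], [0.1, 0.3]]), (100, 200))  # matplotlib Polygon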
+def get_colors(num_colors: int) -> List[Tuple[float, float, float]]:
+    """Generate num_colors color for matplotlib
+
+    Args:
+    ----
+        num_colors: number of colors to generate
+
+    Returns:
+    -------
+        colors: list of generated colors
+    """
+    colors = []
+    for i in np.arange(0.0, 360.0, 360.0 / num_colors):
+        hue = i / 360.0
+        lightness = (50 + np.random.rand() * 10) / 100.0
+        saturation = (90 + np.random.rand() * 10) / 100.0
+        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+    return colors
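For instance (a minimal illustration), asking for three colors yields three (r, g, b) tuples with hues spread evenly around the color wheel, plus a small random jitter on lightness and saturation:

>>> from doctr.utils.visualization import get_colors
>>> colors = get_colors(3)  # hues at 0, 120 and 240 degrees
>>> len(colors)
3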
 
 
 
-[docs] +[docs] def visualize_page( page: Dict[str, Any], image: np.ndarray, @@ -359,18 +472,18 @@

Source code for doctr.utils.visualization

 ) -> Figure:
     """Visualize a full page with predicted blocks, lines and words
 
-    Example::
-        >>> import numpy as np
-        >>> import matplotlib.pyplot as plt
-        >>> from doctr.utils.visualization import visualize_page
-        >>> from doctr.models import ocr_db_crnn
-        >>> model = ocr_db_crnn(pretrained=True)
-        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-        >>> out = model([[input_page]])
-        >>> visualize_page(out[0].pages[0].export(), input_page)
-        >>> plt.show()
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from doctr.utils.visualization import visualize_page
+    >>> from doctr.models import ocr_db_crnn
+    >>> model = ocr_db_crnn(pretrained=True)
+    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+    >>> out = model([[input_page]])
+    >>> visualize_page(out[0].pages[0].export(), input_page)
+    >>> plt.show()
 
     Args:
+    ----
         page: the exported Page of a Document
         image: np array of the page, needs to have the same shape as page['dimensions']
         words_only: whether only words should be displayed
@@ -378,6 +491,11 @@ 

Source code for doctr.utils.visualization

         scale: figsize of the largest window side
         interactive: whether the plot should be interactive
         add_labels: for static plot, adds text labels on top of bounding box
+        **kwargs: keyword arguments for the polygon patch
+
+    Returns:
+    -------
+        the matplotlib figure
     """
     # Get proper scale and aspect ratio
     h, w = image.shape[:2]
@@ -386,128 +504,189 @@ 

Source code for doctr.utils.visualization

     # Display the image
     ax.imshow(image)
     # hide both axis
-    ax.axis('off')
+    ax.axis("off")
 
     if interactive:
         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
 
-    for block in page['blocks']:
+    for block in page["blocks"]:
         if not words_only:
-            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
+            rect = create_obj_patch(
+                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
+            )
             # add patch on figure
             ax.add_patch(rect)
             if interactive:
                 # add patch to cursor's artists
                 artists.append(rect)
 
-        for line in block['lines']:
+        for line in block["lines"]:
             if not words_only:
-                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
+                rect = create_obj_patch(
+                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
-            for word in line['words']:
-                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
-                                         page['dimensions'], (0, 0, 1), **kwargs)
+            for word in line["words"]:
+                rect = create_obj_patch(
+                    word["geometry"],
+                    page["dimensions"],
+                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
+                    color=(0, 0, 1),
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
                 elif add_labels:
-                    if len(word['geometry']) == 5:
+                    if len(word["geometry"]) == 5:
                         text_loc = (
-                            int(page['dimensions'][1] * (word['geometry'][0] - word['geometry'][2] / 2)),
-                            int(page['dimensions'][0] * (word['geometry'][1] - word['geometry'][3] / 2))
+                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
+                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
                         )
                     else:
                         text_loc = (
-                            int(page['dimensions'][1] * word['geometry'][0][0]),
-                            int(page['dimensions'][0] * word['geometry'][0][1])
+                            int(page["dimensions"][1] * word["geometry"][0][0]),
+                            int(page["dimensions"][0] * word["geometry"][0][1]),
+                        )
+
+                    if len(word["geometry"]) == 2:
+                        # We draw only if boxes are in straight format
+                        ax.text(
+                            *text_loc,
+                            word["value"],
+                            size=10,
+                            alpha=0.5,
+                            color=(0, 0, 1),
                         )
-                    ax.text(
-                        *text_loc,
-                        word['value'],
-                        size=10,
-                        alpha=0.5,
-                        color=(0, 0, 1),
-                    )
 
         if display_artefacts:
-            for artefact in block['artefacts']:
-                rect = create_rect_patch(
-                    artefact['geometry'],
-                    'artefact',
-                    page['dimensions'],
-                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
+            for artefact in block["artefacts"]:
+                rect = create_obj_patch(
+                    artefact["geometry"],
+                    page["dimensions"],
+                    label="artefact",
+                    color=(0.5, 0.5, 0.5),
                     linewidth=1,
-                    **kwargs
+                    **kwargs,
                 )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
-    fig.tight_layout(pad=0.)
+    fig.tight_layout(pad=0.0)
 
     return fig
-def synthetize_page( +def visualize_kie_page( page: Dict[str, Any], - draw_proba: bool = False, - font_size: int = 13, -) -> np.ndarray: - """Draw a the content of the element page (OCR response) on a blank page. + image: np.ndarray, + words_only: bool = False, + display_artefacts: bool = True, + scale: float = 10, + interactive: bool = True, + add_labels: bool = True, + **kwargs: Any, +) -> Figure: + """Visualize a full page with predicted blocks, lines and words + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from doctr.utils.visualization import visualize_page + >>> from doctr.models import ocr_db_crnn + >>> model = ocr_db_crnn(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([[input_page]]) + >>> visualize_kie_page(out[0].pages[0].export(), input_page) + >>> plt.show() Args: - page: exported Page object to represent - draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0 - font_size: size of the font, default font = 13 + ---- + page: the exported Page of a Document + image: np array of the page, needs to have the same shape than page['dimensions'] + words_only: whether only words should be displayed + display_artefacts: whether artefacts should be displayed + scale: figsize of the largest windows side + interactive: whether the plot should be interactive + add_labels: for static plot, adds text labels on top of bounding box + **kwargs: keyword arguments for the polygon patch - Return: - A np array (drawn page) + Returns: + ------- + the matplotlib figure """ - # Draw template - h, w = page["dimensions"] - response = 255 * np.ones((h, w, 3), dtype=np.int32) + # Get proper scale and aspect ratio + h, w = image.shape[:2] + size = (scale * w / h, scale) if h > w else (scale, h / w * scale) + fig, ax = plt.subplots(figsize=size) + # Display the image + ax.imshow(image) + # hide both axis + ax.axis("off") - # Draw each word - for block in page["blocks"]: - for line in block["lines"]: - for word in line["words"]: - # Get aboslute word geometry - (xmin, ymin), (xmax, ymax) = word["geometry"] - xmin, xmax = int(w * xmin), int(w * xmax) - ymin, ymax = int(h * ymin), int(h * ymax) - - # White drawing context adapted to font size, 0.75 factor to convert pts --> pix - h_box, w_box = ymax - ymin, xmax - xmin - h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75)) - img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255)) - d = ImageDraw.Draw(img) - - # Draw in black the value of the word - d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0)) - - # Resize back to box size - img = img.resize((w_box, h_box), Image.NEAREST) - - # Colorize if draw_proba - if draw_proba: - p = int(255 * word["confidence"]) - mask = np.where(np.array(img) == 0, 1, 0) - proba = np.array([255 - p, 0, p]) - color = mask * proba[np.newaxis, np.newaxis, :] - white_mask = 255 * (1 - mask) - img = color + white_mask - - # Write to response page - response[ymin:ymax, xmin:xmax, :] = np.array(img) - - return response + if interactive: + artists: List[patches.Patch] = [] # instantiate an empty list of patches (to be drawn on the page) + + colors = {k: color for color, k in zip(get_colors(len(page["predictions"])), page["predictions"])} + for key, value in page["predictions"].items(): + for prediction in value: + if not words_only: + rect = create_obj_patch( + prediction["geometry"], + page["dimensions"], + label=f"{key} \n {prediction['value']} (confidence: 
{prediction['confidence']:.2%}", + color=colors[key], + linewidth=1, + **kwargs, + ) + # add patch on figure + ax.add_patch(rect) + if interactive: + # add patch to cursor's artists + artists.append(rect) + + if interactive: + import mplcursors + + # Create mlp Cursor to hover patches in artists + mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label())) + fig.tight_layout(pad=0.0) + + return fig + + +def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None: + """Draw an array of relative straight boxes on an image + + Args: + ---- + boxes: array of relative boxes, of shape (*, 4) + image: np array, float32 or uint8 + color: color to use for bounding box edges + **kwargs: keyword arguments from `matplotlib.pyplot.plot` + """ + h, w = image.shape[:2] + # Convert boxes to absolute coords + _boxes = deepcopy(boxes) + _boxes[:, [0, 2]] *= w + _boxes[:, [1, 3]] *= h + _boxes = _boxes.astype(np.int32) + for box in _boxes.tolist(): + xmin, ymin, xmax, ymax = box + image = cv2.rectangle( + image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2 + ) + plt.imshow(image) + plt.plot(**kwargs)
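A hypothetical call (the image and box values are made up) showing the expected input format: boxes hold relative (xmin, ymin, xmax, ymax) coordinates, which are scaled to the image size before cv2.rectangle is applied and the result is displayed with matplotlib.

>>> import numpy as np
>>> from doctr.utils.visualization import draw_boxes
>>> image = 255 * np.ones((100, 200, 3), dtype=np.uint8)  # blank 200x100 canvas
>>> boxes = np.array([[0.1, 0.2, 0.5, 0.8]])              # one relative box
>>> draw_boxes(boxes, image, color=(255, 0, 0))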
@@ -540,8 +719,8 @@

Source code for doctr.utils.visualization

       
     
   
- - + + diff --git a/v0.5.1/_modules/index.html b/v0.5.1/_modules/index.html index e86abcd4d4..5793c44f20 100644 --- a/v0.5.1/_modules/index.html +++ b/v0.5.1/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -225,20 +225,42 @@ - - + + diff --git a/v0.5.1/_sources/changelog.rst.txt b/v0.5.1/_sources/changelog.rst.txt index 430097d6c8..35befe7b96 100644 --- a/v0.5.1/_sources/changelog.rst.txt +++ b/v0.5.1/_sources/changelog.rst.txt @@ -1,6 +1,54 @@ Changelog ========= +v0.10.0 (2024-10-21) +------------------- +Release note: `v0.10.0 `_ + +v0.9.0 (2024-08-08) +------------------- +Release note: `v0.9.0 `_ + +v0.8.1 (2024-03-04) +------------------- +Release note: `v0.8.1 `_ + +v0.8.0 (2024-02-28) +------------------- +Release note: `v0.8.0 `_ + +v0.7.0 (2023-09-09) +------------------- +Release note: `v0.7.0 `_ + +v0.6.0 (2022-09-29) +------------------- +Release note: `v0.6.0 `_ + +v0.5.1 (2022-03-22) +------------------- +Release note: `v0.5.1 `_ + +v0.5.0 (2021-12-31) +------------------- +Release note: `v0.5.0 `_ + +v0.4.1 (2021-11-22) +------------------- +Release note: `v0.4.1 `_ + +v0.4.0 (2021-10-01) +------------------- +Release note: `v0.4.0 `_ + +v0.3.1 (2021-08-27) +------------------- +Release note: `v0.3.1 `_ + +v0.3.0 (2021-07-02) +------------------- +Release note: `v0.3.0 `_ + v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.5.1/_sources/datasets.rst.txt b/v0.5.1/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.5.1/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.5.1/_sources/documents.rst.txt b/v0.5.1/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.5.1/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. 
currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.5.1/_sources/getting_started/installing.rst.txt b/v0.5.1/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.5.1/_sources/getting_started/installing.rst.txt +++ b/v0.5.1/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.5.1/_sources/index.rst.txt b/v0.5.1/_sources/index.rst.txt index fc3ff89fdf..53251db142 100644 --- a/v0.5.1/_sources/index.rst.txt +++ b/v0.5.1/_sources/index.rst.txt @@ -1,7 +1,8 @@ -DocTR: Document Text Recognition -================================ +******************************** +docTR: Document Text Recognition +******************************** -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -9,38 +10,29 @@ State-of-the-art Optical Character Recognition made seamless & accessible to any DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. +* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. -Welcome to the documentation of `DocTR `_! 
- - Main Features ------------- * |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract +* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, small dependencies -* |:tools:| Daily maintained -* |:factory:| Easy integration - +* |:bird:| Light package, minimal dependencies +* |:tools:| Actively maintained by Mindee +* |:factory:| Easy integration (available templates for browser demo & API deployment) -Getting Started ---------------- .. toctree:: :maxdepth: 2 + :caption: Getting started + :hidden: - installing - - -Build & train your predictor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) -* Fine-tune or train from scratch any detection or recognition model to specialize on your data + getting_started/installing + notebooks Model zoo @@ -48,36 +40,83 @@ Model zoo Text detection models """"""""""""""""""""" - * `DBNet `_ (Differentiable Binarization) - * `LinkNet `_ +* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ +* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ +* FAST from `"FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation" `_ Text recognition models """"""""""""""""""""""" - * `SAR `_ (Show, Attend and Read) - * `CRNN `_ (Convolutional Recurrent Neural Network) - * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) +* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ +* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ +* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ +* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ +* PARSeq from `"Scene Text Recognition with Permuted Autoregressive Sequence Models" `_ Supported datasets ^^^^^^^^^^^^^^^^^^ - * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. - * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. - * SROIE from `ICDAR 2019 `_. +* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. +* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +* SROIE from `ICDAR 2019 `_. +* IIIT-5k from `CVIT `_. +* Street View Text from `"End-to-End Scene Text Recognition" `_. +* SynthText from `Visual Geometry Group `_. +* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. +* IC03 from `ICDAR 2003 `_. +* IC13 from `ICDAR 2013 `_. +* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. +* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. +* IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. 
toctree:: :maxdepth: 2 - :caption: Notes + :caption: Using docTR + :hidden: - changelog + using_doctr/using_models + using_doctr/using_datasets + using_doctr/using_contrib_modules + using_doctr/sharing_models + using_doctr/using_model_export + using_doctr/custom_models_training + using_doctr/running_on_aws + + +.. toctree:: + :maxdepth: 2 + :caption: Community + :hidden: + + community/resources .. toctree:: :maxdepth: 2 :caption: Package Reference + :hidden: - datasets - documents - models - transforms - utils + modules/contrib + modules/datasets + modules/io + modules/models + modules/transforms + modules/utils + + +.. toctree:: + :maxdepth: 2 + :caption: Contributing + :hidden: + + contributing/code_of_conduct + contributing/contributing + + +.. toctree:: + :maxdepth: 2 + :caption: Notes + :hidden: + + changelog diff --git a/v0.5.1/_sources/installing.rst.txt b/v0.5.1/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.5.1/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.5.1/_sources/models.rst.txt b/v0.5.1/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.5.1/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. 
- - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. 
-We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.5.1/_sources/transforms.rst.txt b/v0.5.1/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.5.1/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. 
autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.5.1/_sources/utils.rst.txt b/v0.5.1/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.5.1/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.5.1/_static/basic.css b/v0.5.1/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.5.1/_static/basic.css +++ b/v0.5.1/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.5.1/_static/doctools.js b/v0.5.1/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.5.1/_static/doctools.js +++ b/v0.5.1/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.5.1/_static/documentation_options.js b/v0.5.1/_static/documentation_options.js index a7b5cbe04a..4f656fdbea 100644 --- a/v0.5.1/_static/documentation_options.js +++ b/v0.5.1/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.3.0a0-git', + VERSION: '0.10.1a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.5.1/_static/language_data.js b/v0.5.1/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.5.1/_static/language_data.js +++ b/v0.5.1/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.5.1/_static/searchtools.js b/v0.5.1/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.5.1/_static/searchtools.js +++ b/v0.5.1/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.5.1/changelog.html b/v0.5.1/changelog.html index eafac3a877..fc45a50384 100644 --- a/v0.5.1/changelog.html +++ b/v0.5.1/changelog.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + Changelog - docTR documentation @@ -226,20 +226,42 @@ + diff --git a/v0.5.1/community/resources.html b/v0.5.1/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.5.1/community/resources.html +++ b/v0.5.1/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.5.1/contributing/code_of_conduct.html b/v0.5.1/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.5.1/contributing/code_of_conduct.html +++ b/v0.5.1/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.5.1/contributing/contributing.html b/v0.5.1/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.5.1/contributing/contributing.html +++ b/v0.5.1/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.5.1/datasets.html b/v0.5.1/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.5.1/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -
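As a rough sketch of how this base class is typically used (the dataset name, URL and archive file name below are placeholders, not a real hosted archive), a custom dataset can subclass VisionDataset and let it handle the verified download:

>>> from doctr.datasets.datasets import VisionDataset
>>> class MyDataset(VisionDataset):
...     # hypothetical archive location: replace with a real URL and file name
...     def __init__(self, train: bool = True, **kwargs):
...         super().__init__(url="https://example.com/my_dataset.zip",
...                          file_name="my_dataset.zip",
...                          extract_archive=True,
...                          **kwargs)
...         self.train = train
>>> ds = MyDataset(train=True, download=True)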

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
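Unlike the public datasets above, OCRDataset points to local data. A minimal usage sketch follows; the image folder and label file paths are placeholders to be replaced with your own data:

>>> from doctr.datasets import OCRDataset
>>> train_set = OCRDataset(img_folder="path/to/images", label_file="path/to/labels.json")
>>> img, target = train_set[0]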
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before being passed to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets of vocabs.

-
DocTR Vocabs

Name            size   characters
digits          10     0123456789
ascii_letters   52     abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
punctuation     32     !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
currency        5      £€¥¢฿
latin           96     0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°
french          154    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as a mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
- -
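A minimal sketch of how encode_sequences can be used with one of the vocabs above; the vocab string, target size and the EOS/PAD indices are illustrative choices, not prescribed values:

>>> from doctr.datasets import encode_sequences
>>> vocab = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
>>> # each character is mapped to its index in the vocab; EOS and PAD use indices outside of it
>>> encoded = encode_sequences(["Hello", "docTR"], vocab=vocab, target_size=16, eos=len(vocab), pad=len(vocab) + 1)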
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/documents.html b/v0.5.1/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.5.1/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, words at the same height in different columns belong to two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and the confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
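A minimal sketch of how these elements nest into one another; the text, confidence, geometry and page dimensions below are made-up values, with coordinates relative to the page size as described above:

>>> from doctr.documents import Word, Line, Block, Page, Document
>>> word = Word(value="hello", confidence=0.99, geometry=((0.1, 0.1), (0.3, 0.15)))
>>> line = Line(words=[word])
>>> block = Block(lines=[line])
>>> page = Page(blocks=[block], page_idx=0, dimensions=(842, 595))
>>> doc = Document(pages=[page])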
-

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF file as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert them into images in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/genindex.html b/v0.5.1/genindex.html index a19b433943..21520455b4 100644 --- a/v0.5.1/genindex.html +++ b/v0.5.1/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -224,20 +224,42 @@

+
+

U

+ + +
+
+

V

@@ -561,7 +711,13 @@

V

W

+
@@ -599,8 +755,8 @@

W

- - + + diff --git a/v0.5.1/getting_started/installing.html b/v0.5.1/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.5.1/getting_started/installing.html +++ b/v0.5.1/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.5.1/index.html b/v0.5.1/index.html index 4c6a28c66a..3a06afc6d9 100644 --- a/v0.5.1/index.html +++ b/v0.5.1/index.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + docTR documentation @@ -226,20 +226,42 @@
-

DocTR: Document Text Recognition

-

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta)

+

docTR: Document Text Recognition

+

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

DocTR provides an easy and powerful way to extract valuable information from your documents:

    -
  • 🧾 for automation: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • +
  • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • 👩‍🔬 for research: quickly compare your own architectures' speed & performance with state-of-the-art models on public datasets.

-

Welcome to the documentation of DocTR!

Main Features

  • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

  • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor

  • -
  • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

  • +
  • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

  • ⚡ Optimized for inference speed on both CPU & GPU

  • -
  • 🐦 Light package, small dependencies

  • -
  • 🛠️ Daily maintained

  • -
  • 🏭 Easy integration

  • +
  • 🐦 Light package, minimal dependencies

  • +
  • 🛠️ Actively maintained by Mindee

  • +
  • 🏭 Easy integration (available templates for browser demo & API deployment)

-
-
-

Getting Started

-
-

Build & train your predictor

-
    -
  • Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

  • -
  • Fine-tune or train from scratch any detection or recognition model to specialize on your data

  • -
-

Model zoo

Text detection models

-
-

Text recognition models

-
-

Supported datasets

-
-
+
+
+
+
+
@@ -406,7 +381,7 @@

Supported datasets - +
Next @@ -446,10 +421,8 @@

Supported datasets + diff --git a/v0.5.1/installing.html b/v0.5.1/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.5.1/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running an OS other than Linux, you will need a few extra dependencies.

-

For MacOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the latest stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/models.html b/v0.5.1/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.5.1/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
Architecture   Input shape       # params   FUNSD Recall   FUNSD Precision   CORD Recall   CORD Precision   FPS
db_resnet50    (1024, 1024, 3)   25.2 M     82.14          87.64             92.49         89.66            2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

-
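A rough sketch of that protocol, using the db_resnet50 model documented below; this is an illustration of the procedure, not the exact benchmarking script that produced the numbers above:

>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> # warm-up with 100 random tensors
>>> for _ in range(100):
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32), training=False)
>>> # measure the average speed on 1000 batches of 1 frame
>>> start = time.time()
>>> for _ in range(1000):
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32), training=False)
>>> fps = 1000 / (time.time() - start)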
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
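A minimal sketch of those three steps using plain TensorFlow ops; the normalization statistics below are placeholders, not the actual training statistics:

>>> import tensorflow as tf
>>> page = tf.random.uniform(shape=[900, 600, 3], maxval=1, dtype=tf.float32)
>>> resized = tf.image.resize(page, (1024, 1024), method="bilinear")  # 1. resize, aspect ratio may change
>>> batch = tf.stack([resized], axis=0)                               # 2. batch images together
>>> mean, std = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)                      # placeholder statistics
>>> batch = (batch - tf.constant(mean)) / tf.constant(std)            # 3. normalize the batch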
-

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
Text recognition model zoo

Architecture    Input shape    # params   FUNSD   CORD   FPS
crnn_vgg16_bn   (32, 128, 3)   15.8M      86.02   91.3   12.8
sar_vgg16_bn    (32, 128, 3)   21.5M      86.2    91.7   3.3
sar_resnet31    (32, 128, 3)   53.1M      86.3    92.1   2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model's capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
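The same kind of sketch for recognition crops, again with plain TensorFlow ops and placeholder normalization statistics:

>>> import tensorflow as tf
>>> crop = tf.random.uniform(shape=[24, 100, 3], maxval=1, dtype=tf.float32)
>>> resized = tf.image.resize(crop, (32, 128), preserve_aspect_ratio=True)  # 1. resize without deformation
>>> padded = tf.image.pad_to_bounding_box(resized, 0, 0, 32, 128)           # 2. pad to the target size with zeros
>>> batch = tf.stack([padded], axis=0)                                      # 3. batch images together
>>> batch = (batch - 0.5) / 0.5                                             # 4. normalize (placeholder statistics)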
-
-

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
-

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import crnn_vgg16_bn
->>> model = crnn_vgg16_bn(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a VGG16 feature extractor as described in “Show, Attend and Read:A Simple and Strong -Baseline for Irregular Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import sar_vgg16_bn
->>> model = sar_vgg16_bn(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a resnet-31 feature extractor as described in “Show, Attend and Read:A Simple and Strong -Baseline for Irregular Text Recognition”.

-

Example

-
>>> import tensorflow as tf
->>> from doctr.models import sar_resnet31
->>> model = sar_resnet31(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
-

MASTER as described in the paper: https://arxiv.org/pdf/1910.02562.pdf. Example:

-
>>> import tensorflow as tf
->>> from doctr.models import master
->>> model = master(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-

Recognition predictors

-

Combining the right components around a given architecture for easier usage.

-
-
-doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
-

Text recognition architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import recognition_predictor
->>> model = recognition_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • -
  • pretrained – If True, returns a model pre-trained on our text recognition dataset

  • -
-
-
Returns:
-

Recognition predictor

-
-
-
- -
-
-
-

End-to-End OCR

-

Predictors that localize and identify text elements in images

-
Architecture                  FUNSD Recall   FUNSD Precision   FUNSD FPS   CORD Recall   CORD Precision   CORD FPS
db_resnet50 + crnn_vgg16_bn   70.08          74.77             0.85        82.19         79.67            1.6
db_resnet50 + sar_vgg16_bn    N/A            N/A               0.49        N/A           N/A              1.0
db_resnet50 + sar_resnet31    N/A            N/A               0.27        N/A           N/A              0.83
Gvision text detection        59.50          62.50                         75.30         70.00
Gvision doc. text detection   64.00          53.30                         68.90         61.10
AWS textract                  78.10          83.00                         87.50         66.00

-
-

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All recognition models of predictors are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

-

FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

-

Results on private ocr datasets

-
Architecture                         Receipts Recall   Receipts Precision   Invoices Recall   Invoices Precision   IDs Recall   IDs Precision
db_resnet50 + crnn_vgg16_bn (ours)   78.90             81.01                65.68             69.86                49.48        50.46
Gvision doc. text detection          68.91             59.89                63.20             52.85                43.70        29.21
AWS textract                         75.77             77.70                70.47             69.13                46.39        43.32

-
-
-

Two-stage approaches

-

Those architectures involve one stage of text detection, and one stage of text recognition. The text detection output is used to produce cropped images that are then passed to the text recognition block.

-
-
-doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]
-

End-to-end OCR architecture using one model for localization, and another for text recognition.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import ocr_predictor
->>> model = ocr_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

  • -
  • pretrained – If True, returns a model pre-trained on our OCR dataset

  • -
-
-
Returns:
-

OCR predictor

-
-
-
- -
-
-
-

Model export

-

Utility functions to make the most of document analysis models.

-
-

Model compression

-
-
-doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
-

Converts a model to TFLite format

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_tflite, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_tflite(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
-

Converts a model to half precision

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_fp16, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_fp16(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the serialized FP16 model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
-

Quantize a Tensorflow model

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import quantize_model, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = quantize_model(model, (224, 224, 3))
-
-
-
-
-
-
Parameters:
-
    -
  • tf_model – a keras model

  • -
  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

  • -
-
-
Returns:
-

the serialized quantized model

-
-
Return type:
-

bytes

-
-
-
- -
-
-

Using SavedModel

-

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

-
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> _ = model(input_t, training=False)
->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
-
-
-

And loaded just as easily:

-
>>> import tensorflow as tf
->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
-
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/modules/contrib.html b/v0.5.1/modules/contrib.html index e99f6b3f74..7fb86b8b38 100644 --- a/v0.5.1/modules/contrib.html +++ b/v0.5.1/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -380,7 +380,7 @@

Supported contribution modules - + diff --git a/v0.5.1/modules/datasets.html b/v0.5.1/modules/datasets.html index 456e10b172..380a986793 100644 --- a/v0.5.1/modules/datasets.html +++ b/v0.5.1/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -1081,7 +1081,7 @@

Returns:

- + diff --git a/v0.5.1/modules/io.html b/v0.5.1/modules/io.html index 01eadaa4b8..24c41954be 100644 --- a/v0.5.1/modules/io.html +++ b/v0.5.1/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -760,7 +760,7 @@

Returns: - + diff --git a/v0.5.1/modules/models.html b/v0.5.1/modules/models.html index c465cc0586..91b8810a6a 100644 --- a/v0.5.1/modules/models.html +++ b/v0.5.1/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1612,7 +1612,7 @@

Args: - + diff --git a/v0.5.1/modules/transforms.html b/v0.5.1/modules/transforms.html index 30f7a2631a..c5ead3f3ce 100644 --- a/v0.5.1/modules/transforms.html +++ b/v0.5.1/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -835,7 +835,7 @@

Args:< - + diff --git a/v0.5.1/modules/utils.html b/v0.5.1/modules/utils.html index 888a32c321..b7f6fc570b 100644 --- a/v0.5.1/modules/utils.html +++ b/v0.5.1/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -715,7 +715,7 @@

Args: - + diff --git a/v0.5.1/notebooks.html b/v0.5.1/notebooks.html index f97771aebb..d36539f59e 100644 --- a/v0.5.1/notebooks.html +++ b/v0.5.1/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -391,7 +391,7 @@

docTR Notebooks - + diff --git a/v0.5.1/objects.inv b/v0.5.1/objects.inv index a22d2ce821..c1700f291b 100644 Binary files a/v0.5.1/objects.inv and b/v0.5.1/objects.inv differ diff --git a/v0.5.1/py-modindex.html b/v0.5.1/py-modindex.html deleted file mode 100644 index c1569be607..0000000000 --- a/v0.5.1/py-modindex.html +++ /dev/null @@ -1,330 +0,0 @@ - - - - - - - - - - - Python Module Index - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
- -
-

Python Module Index

- -
- - - - - - - - - - - -
 
d
- doctr -
- -
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.5.1/search.html b/v0.5.1/search.html index 73772822d2..d050f5eac7 100644 --- a/v0.5.1/search.html +++ b/v0.5.1/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -226,20 +226,42 @@ - - + + diff --git a/v0.5.1/searchindex.js b/v0.5.1/searchindex.js index 803f4f4bcf..6f154115ab 100644 --- a/v0.5.1/searchindex.js +++ b/v0.5.1/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Artefact": [[2, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Block": [[2, "block"]], "Build & train your predictor": [[3, "build-train-your-predictor"]], "Changelog": [[0, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection models": [[5, "detection-models"]], "Detection predictors": [[5, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[3, null]], "Document": [[2, "document"]], "Document structure": [[2, "document-structure"]], "End-to-End OCR": [[5, "end-to-end-ocr"]], "File reading": [[2, "file-reading"]], "Getting Started": [[3, "getting-started"]], "Installation": [[4, null]], "Line": [[2, "line"]], "Main Features": [[3, "main-features"]], "Model compression": [[5, "model-compression"]], "Model export": [[5, "model-export"]], "Model zoo": [[3, "model-zoo"]], "Notes": [[3, null]], "Package Reference": [[3, null]], "Page": [[2, "page"]], "Pre-processing for detection": [[5, "pre-processing-for-detection"]], "Pre-processing for recognition": [[5, "pre-processing-for-recognition"]], "Prerequisites": [[4, "prerequisites"]], "Recognition models": [[5, "recognition-models"]], "Recognition predictors": [[5, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[3, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[7, "task-evaluation"]], "Text Detection": [[5, "text-detection"]], "Text Recognition": [[5, "text-recognition"]], "Text detection models": [[3, "text-detection-models"]], "Text recognition model zoo": [[5, "id2"]], "Text recognition models": [[3, "text-recognition-models"]], "Two-stage approaches": [[5, "two-stage-approaches"]], "Using SavedModel": [[5, "using-savedmodel"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[7, "visualization"]], "Word": [[2, "word"]], "doctr.datasets": [[1, null]], "doctr.documents": [[2, null]], "doctr.models": [[5, null]], "doctr.transforms": [[6, null]], "doctr.utils": [[7, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]]}, "docnames": ["changelog", "datasets", "documents", "index", "installing", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "documents.rst", "index.rst", "installing.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.documents)": [[2, "doctr.documents.Artefact", false]], "as_images() (doctr.documents.pdf 
method)": [[2, "doctr.documents.PDF.as_images", false]], "block (class in doctr.documents)": [[2, "doctr.documents.Block", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.documents)": [[2, "doctr.documents.Document", false]], "documentfile (class in doctr.documents)": [[2, "doctr.documents.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_images", false]], "from_pdf() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_pdf", false]], "from_url() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_artefacts", false]], "get_words() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.documents)": [[2, "doctr.documents.Line", false]], "linknet16() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet16", false]], "localizationconfusion (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.LocalizationConfusion", false]], "master() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.master", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[5, "doctr.models.zoo.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.documents)": [[2, "doctr.documents.Page", false]], "pdf (class in doctr.documents)": [[2, "doctr.documents.PDF", false]], "quantize_model() (in module doctr.models.export)": [[5, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[6, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality 
(class in doctr.transforms)": [[6, "doctr.transforms.RandomJpegQuality", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.documents)": [[2, "doctr.documents.read_html", false]], "read_img() (in module doctr.documents)": [[2, "doctr.documents.read_img", false]], "read_pdf() (in module doctr.documents)": [[2, "doctr.documents.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_vgg16_bn", false]], "show() (doctr.documents.document method)": [[2, "doctr.documents.Document.show", false]], "show() (doctr.documents.page method)": [[2, "doctr.documents.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[7, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[7, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[7, "doctr.utils.metrics.TextMatch.summary", false]], "textmatch (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.datasets)": [[1, "doctr.datasets.datasets.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[7, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.documents)": [[2, "doctr.documents.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, "", "CORD"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.datasets": [[1, 0, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.documents": [[2, 0, 1, "", "Artefact"], [2, 0, 1, "", "Block"], [2, 0, 1, "", "Document"], [2, 0, 1, "", "DocumentFile"], [2, 0, 1, "", "Line"], [2, 0, 1, "", "PDF"], [2, 0, 1, "", "Page"], [2, 0, 1, "", "Word"], [2, 1, 1, "", "read_html"], [2, 1, 1, "", "read_img"], [2, 1, 1, "", "read_pdf"]], "doctr.documents.Document": [[2, 2, 1, "", "show"]], "doctr.documents.DocumentFile": [[2, 2, 1, "", "from_images"], [2, 2, 1, "", "from_pdf"], [2, 2, 1, "", "from_url"]], "doctr.documents.PDF": [[2, 2, 1, "", "as_images"], [2, 2, 1, "", "get_artefacts"], [2, 2, 1, "", "get_words"]], "doctr.documents.Page": [[2, 2, 1, "", "show"]], "doctr.models.detection": [[5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet16"]], "doctr.models.export": [[5, 1, 1, "", "convert_to_fp16"], [5, 1, 1, "", "convert_to_tflite"], [5, 1, 1, "", "quantize_model"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "master"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"], [5, 1, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[5, 1, 1, "", "ocr_predictor"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", 
"RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", "RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[7, 0, 1, "", "LocalizationConfusion"], [7, 0, 1, "", "OCRMetric"], [7, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.OCRMetric": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.TextMatch": [[7, 2, 1, "", "summary"]], "doctr.utils.visualization": [[7, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 7], "0": [1, 3, 5, 6, 7], "00": 5, "01": 5, "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 5, "02562": 5, "03": 3, "035": [], "0361328125": [], "04": [], "05": 3, "06": [], "06640625": [], "07": [], "08": 5, "09": [], "0966796875": [], "1": [1, 3, 5, 6, 7], "10": [1, 5, 7], "100": [5, 6, 7], "1000": 5, "101": [], "1024": [5, 7], "104": [], "106": [], "108": [], "1095": [], "11": 3, "110": 7, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 5, "120": [], "123": [], "126": [], "1268": [], "128": 5, "13": 5, "130": [], "13068": [], "131": [], "1337891": [], "1357421875": [], "1396484375": [], "14": 5, "1420": [], "14470v1": [], "149": [], "15": 5, "150": 7, "154": 1, "1552": [], "16": 5, "160": 5, "1630859375": [], "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": 3, "185546875": [], "19": 5, "1900": [], "1910": 5, "19342": [], "19370": [], "195": [], "19598": [], "199": 5, "1999": [], "1m": 5, "2": [3, 5, 6], "20": 5, "200": 7, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 3, "2021": 3, "2023": [], "207901": [], "21": 5, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6], "225": 6, "22672": [], "229": 6, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 5, "2504": [], "255": [5, 6, 7], "256": 5, "257": [], "26": [], "26032": [], "264": [], "27": 5, "2700": [], "2710": [], "2749": [], "28": 3, "287": [], "29": 5, "296": [], "299": [], "2d": [], "3": [2, 3, 4, 5, 6, 7], "30": 5, "300": [], "3000": [], "301": [], "30595": 5, "30ghz": [], "31": 5, "32": [1, 5, 6], "3232421875": [], "33": [], "33402": [], "33608": [], "34": [], "340": [], "3456": [], "3515625": [], "36": [], "360": [], "37": [], "38": [], "39": 5, "4": [], "40": [], "406": 6, "41": [], "42": [], "43": 5, "44": [], "45": [], "456": 6, "46": 5, "47": 5, "472": [], "48": 5, "485": 6, "49": 5, "49377": [], "5": [1, 6, 7], "50": 5, "51": [], "51171875": [], "512": [], "52": [1, 5], "529": [], "53": 5, "533": [], "54": [], "540": [], "5478515625": [], "55": [], "56": [], "57": [], "58": [], "580": [], "5810546875": [], "583": [], "59": 5, "595": [], "597": [], "5k": [], "5m": 5, "6": [4, 5, 6], "60": 6, "600": [5, 7], "61": 5, "611": [], "62": 5, "625": [], "626": [], "629": [], "63": 5, "630": [], "64": [5, 6], "640": [], "641": [], "647": [], "65": 5, "66": 5, "660": [], "664": [], "666": [], "67": 5, "672": [], "68": 5, "689": [], "69": 5, "693": [], "694": [], "695": [], "6m": [], "7": 5, "70": [5, 7], "700": [], "701": [], "702": [], "707470": [], "71": [], "7100000": [], 
"713": [], "7141797": [], "7149": [], "72": [], "72dpi": [], "73": [], "73257": [], "733": [], "74": 5, "745": [], "75": 5, "753": [], "7581382": [], "76": [], "77": 5, "772": [], "772875": [], "78": 5, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 5, "793533": [], "796": [], "798": [], "7m": [], "8": [5, 6], "80": [], "800": [5, 7], "81": 5, "817": [], "82": 5, "8275l": 5, "83": 5, "830": [], "84": [], "849": [], "85": 5, "8564453125": [], "857": [], "85875": [], "86": 5, "860": [], "8603515625": [], "862": [], "863": [], "87": 5, "8707": [], "875": [], "88": [], "89": 5, "8m": 5, "9": [], "90": 5, "90k": [], "90kdict32px": [], "91": 5, "913": [], "914085328578949": [], "917": [], "92": 5, "921": [], "93": [], "94": [], "95": 7, "9578408598899841": [], "96": 1, "97": [], "98": [], "99": [], "9949972033500671": [], "A": [1, 2, 3, 5], "And": 5, "As": [], "Be": [], "Being": [], "By": [], "For": [4, 5], "If": [2, 4, 5], "In": [1, 5], "It": 6, "Its": 5, "No": [], "Of": 1, "Or": [], "The": [1, 2, 5, 7], "Then": 5, "To": [], "_": [1, 5], "__call__": [], "_build": [], "_i": 7, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": [], "abl": [], "about": 5, "abov": 5, "abstract": 1, "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 3], "account": [], "accur": [], "accuraci": 7, "achiev": [], "act": [], "action": [], "activ": [], "ad": 6, "adapt": [], "add": [6, 7], "add_hook": [], "add_label": 7, "addit": [], "addition": 5, "address": 2, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": [], "aggreg": [1, 7], "aggress": [], "align": 2, "all": [1, 2, 3, 5, 6, 7], "allow": [], "along": 5, "alreadi": [], "also": [], "alwai": [], "an": [1, 2, 3, 5, 7], "analysi": [2, 5], "ancient_greek": [], "andrej": [], "angl": 2, "ani": [1, 2, 3, 5, 6, 7], "annot": 2, "anot": [], "anoth": [1, 4, 5], "answer": [], "anyascii": [], "anyon": 3, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": 5, "appoint": [], "appreci": [], "appropri": [], "ar": [1, 2, 4, 5, 6, 7], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [3, 5], "archiv": [], "area": [], "argument": [1, 2], "around": 5, "arrai": [2, 7], "art": 3, "artefact": 7, "artefact_typ": 2, "articl": [], "artifici": [], "arxiv": 5, "as_imag": 2, "asarrai": 7, "ascii_lett": 1, "aspect": [3, 6], "assess": 7, "assign": 7, "associ": 2, "assum": [], "assume_straight_pag": [], "astyp": [5, 7], "attack": [], "attend": [3, 5], "attent": [], "autoclass": [], "autom": 3, "automat": [], "autoregress": [], "avail": [3, 5, 6], "averag": [5, 6], "avoid": [], "aw": [3, 5], "awar": [], "azur": [], "b": 7, "b_j": 7, "back": [], "backbon": 5, "backend": 5, "background": [], "bangla": [], "bar": [], "bar_cod": [], "baranovskij": [], "base": 5, "baselin": 5, "batch": [1, 5, 6], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [5, 7], "befor": 1, "begin": 7, "behavior": [], "being": [5, 7], "belong": [], "benchmark": [], "best": [], "beta": 3, "better": [], "between": [6, 7], "bgr": 2, "bilinear": [5, 6], "bin_thresh": [], "binar": [3, 5], "binari": 2, "bit": [], "block": [5, 7], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 2, 5, 6, 7], "boolean": [], "both": [3, 5, 6], "bottom": [], "bound": [1, 2, 6, 7], "box": [1, 2, 7], "box_thresh": [], "brew": 4, "bright": 6, "browser": 
[], "build": [], "built": [], "byte": [2, 5], "c": [], "c5": 5, "c_j": [], "cach": [], "cache_sampl": [], "cairo": 4, "call": [], "callabl": [1, 6], "can": [1, 4, 5], "capabl": 5, "case": [1, 7], "cf": 5, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": 3, "channel": [2, 5, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 3, 5, 7], "charactergener": [], "characterist": [], "charg": 5, "charset": [], "chart": 2, "check": [], "checkpoint": [], "chip": [], "christian": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 2, 6, 7], "class_nam": [], "classif": [], "classmethod": 2, "clear": [], "clone": 4, "close": [], "co": [], "code": [2, 3], "codecov": [], "colab": [], "collate_fn": [], "collect": 2, "color": 6, "colorinvers": 6, "column": 2, "com": [2, 4], "combin": 5, "command": [], "comment": [], "commit": [], "common": [6, 7], "commun": [], "compar": 3, "comparison": 7, "competit": 1, "compil": [], "complaint": [], "complementari": 7, "complet": [], "compon": 5, "compos": [1, 3, 5], "comprehens": [], "comput": [5, 7], "conf_threshold": [], "confid": 2, "config": [], "configur": [], "confus": 7, "consecut": [5, 6], "consequ": [], "consid": [1, 2, 7], "consist": [], "consolid": [1, 3], "constant": 6, "construct": [], "contact": [], "contain": [], "content": [1, 2], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 5, "convers": 2, "convert": [2, 5, 6], "convert_page_to_numpi": 2, "convert_to_fp16": 5, "convert_to_tflit": 5, "convolut": 3, "cool": [], "coordin": 2, "cord": [1, 3, 5], "core": 7, "corner": [], "correct": 6, "correspond": [4, 5], "could": [], "counterpart": 7, "cover": [], "coverag": [], "cpu": [3, 5], "creat": [], "crnn": [3, 5], "crnn_mobilenet_v3_larg": [], "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 5, "crnn_vgg16_bn": 5, "crop": 5, "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": [], "daili": 3, "danish": [], "data": [2, 3, 5, 6, 7], "dataload": 1, "dataset": 5, "dataset_info": [], "date": [], "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": [], "db_resnet34": [], "db_resnet50": 5, "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [3, 5], "deal": [], "decis": [], "decod": 2, "decode_img_as_tensor": [], "dedic": [], "deem": [], "deep": 5, "def": [], "default": [2, 5], "defer": 1, "defin": 7, "deform": 5, "degre": [], "degress": 2, "delet": [], "delimit": [], "delta": 6, "demo": [], "demonstr": [], "depend": [3, 4], "deploi": [], "deploy": [], "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": [], "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": 5, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": [], "developp": 4, "deviat": 6, "devic": [], "dict": [2, 7], "dictionari": [2, 7], "differ": [], "differenti": [3, 5], "digit": 1, "dimens": [2, 5, 7], "dimension": 6, "direct": [], "directli": 5, "directori": [], "disabl": [], 
"disable_crop_orient": [], "disable_page_orient": [], "disclaim": 5, "discuss": [], "disk": [], "disparag": [], "displai": [2, 7], "display_artefact": 7, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": [], "do": 4, "doc": [2, 5], "docartefact": [], "docstr": [], "doctr": 4, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 5, 7], "documentbuild": [], "documentfil": 2, "doesn": [], "don": [], "done": 6, "download": 1, "downsiz": [], "draw": 6, "drop": 1, "drop_last": 1, "dtype": 5, "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": [], "dynamic_seq_length": [], "e": [2, 4], "each": [1, 2, 3, 5, 6, 7], "eas": [], "easi": [3, 7], "easier": 5, "easili": [2, 5, 7], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 5], "either": 5, "element": [1, 2, 5], "els": [], "email": [], "empathi": [], "en": [], "enabl": 2, "enclos": 2, "encod": [1, 2, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 3, 7], "english": [], "enough": 5, "ensur": [], "entir": 2, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 2, "ethnic": [], "evalu": [1, 3, 5], "event": [], "everyon": [], "everyth": [], "exact": 7, "exactmatch": [], "exampl": [1, 2, 5, 6, 7], "exchang": [], "exclud": 5, "execut": [], "exist": [], "expand": [], "expect": [2, 5, 6], "experi": 5, "explan": 5, "explicit": [], "exploit": 5, "export": [2, 3, 7], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 2, "extern": [], "extra": 4, "extract": [1, 3], "extract_arch": 1, "extractor": 5, "f_": 7, "f_a": 7, "factor": 6, "fair": [], "fairli": [], "fals": [1, 5, 6, 7], "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [5, 7], "feed": 5, "feedback": [], "feel": [], "felix92": [], "few": 4, "figsiz": 7, "figur": 7, "file": [1, 3], "file_hash": 1, "file_nam": 1, "final": [], "find": 4, "fine": 3, "finnish": [], "first": [], "firsthand": [], "fit": [], "fitz": 2, "flag": [], "flexibl": 7, "flip": [], "float": [2, 6, 7], "float32": 5, "fn": 6, "focu": [], "focus": [], "folder": [1, 5], "follow": [1, 4, 5, 6, 7], "font": [], "font_famili": [], "foral": 7, "forc": [], "forg": [], "form": [1, 3], "format": [2, 5], "forpost": [1, 3], "forum": [], "found": [], "fp": 5, "fp16": 5, "frac": 7, "frame": 5, "framework": 1, "free": [], "french": [1, 5], "friendli": 3, "from": [1, 2, 3, 5, 6, 7], "from_hub": [], "from_imag": 2, "from_pdf": 2, "from_url": 2, "full": [1, 5, 7], "fulli": [], "function": [5, 6, 7], "funsd": [1, 3, 5], "further": [], "futur": [], "g": 2, "g_": 7, "g_x": 7, "gallagh": [], "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 4, "gen": [], "gender": [], "gener": [], "generic_cyrillic_lett": [], "geometri": 2, "geq": 7, "german": [], "get": 2, "get_artefact": 2, "get_word": 2, "gettextword": 2, "git": 3, "github": 4, "give": [], "given": [1, 2, 5, 7], "global": [], "go": [], "good": [], "googl": [], "googlevis": 3, "gpu": 3, "gracefulli": [], "graph": 2, "grayscal": 6, "ground": 7, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "gtk": 4, "guid": [], "guidanc": [], "gvision": 5, "h": 2, "h_": 7, "ha": [1, 7], "half": 5, "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 7, "have": [1, 5, 7], "head": [], "healthi": [], "hebrew": [], "height": 2, "hello": 7, 
"help": [], "here": [1, 4, 6], "hf": [], "hf_hub_download": [], "high": 2, "higher": 4, "hindi": [], "hindi_digit": [], "hocr": [], "hook": [], "horizont": 2, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [2, 4, 5], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 2, 5, 6, 7], "i7": [], "ibrahimov": [], "ic03": [], "ic13": [], "icdar": 3, "icdar2019": 1, "id": 5, "ident": [], "identifi": [3, 5], "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 2, 5, 6, 7], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": [], "img_fold": 1, "img_path": [], "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 2, 5, 6, 7], "import": [1, 2, 5, 6, 7], "improv": [], "inappropri": [], "incid": [], "includ": [4, 5], "inclus": [], "increas": 6, "independ": [], "index": 2, "indic": 7, "individu": [], "infer": [3, 6], "inform": [1, 3, 5], "inherit": [1, 5], "input": [2, 5, 6], "input_crop": [], "input_pag": [5, 7], "input_shap": 5, "input_t": 5, "input_tensor": 5, "inspir": 6, "instal": 3, "instanc": 5, "instanti": 5, "instead": [1, 2], "insult": [], "int": [1, 2, 5, 6, 7], "int64": [], "integ": 7, "integr": 3, "intel": [], "interact": [2, 7], "interfac": [], "interoper": [], "interpol": [5, 6], "interpret": [1, 2], "intersect": 7, "invert": 6, "investig": [], "invis": [], "invoic": 5, "involv": 5, "io": [], "iou": 7, "iou_thresh": 7, "iou_threshold": [], "irregular": 5, "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 2, 5, 7], "itself": [], "j": 7, "jame": [], "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 2], "json": [], "json_output": [], "jump": [], "just": 5, "kei": [], "kera": 5, "kernel": [], "kernel_s": 5, "kernel_shap": [], "keywoard": [], "keyword": [1, 2], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 2, 5, 7], "l": 7, "l_j": 7, "label": [1, 7], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": [], "languag": [2, 3], "larg": [], "largest": 7, "last": [1, 4, 5], "latenc": [], "later": [], "latest": 4, "latin": 1, "layer": [], "layout": [], "lead": [], "leader": [], "learn": 5, "least": 4, "left": 7, "legacy_french": [], "length": 1, "less": [], "let": 5, "letter": [], "level": [5, 7], "levenshtein": [], "leverag": [], "lf": [], "libffi": 4, "librari": 4, "light": 3, "lightweight": [], "like": [], "limits_": 7, "line": [3, 7], "line_1_1": [], "link": [], "linknet": [3, 5], "linknet16": 5, "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 4, "list": [1, 2, 6], "ll": 7, "load": [3, 5], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 3, 5, 7], "localis": [], "localizationconfus": 7, "locat": [], "login": [], "login_to_hub": [], "logo": 2, "love": [], "lower": [6, 7], "m": [5, 7], "m1": [], "macbook": [], "machin": [], "maco": 4, "made": 3, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 3, "mainten": [], "make": [5, 7], "mani": [], "manipul": [], "map": 1, "map_loc": [], "mask_shap": 7, "master": [3, 5], "match": [3, 7], "mathcal": 7, "matplotlib": 7, "max": 7, "max_angl": [], "max_area": [], "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": [], "maximum": 1, "maxval": [5, 6], "mbox": 7, "mean": [6, 7], "meaniou": 7, "meant": 2, "measur": 5, "media": [], "median": [], 
"meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [5, 7], "middl": [], "might": 5, "min": [], "min_area": [], "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": [], "min_val": 6, "minde": 4, "minim": [], "minimalist": [], "minimum": 7, "minval": 6, "miss": [], "mistak": [], "mix": 3, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": [], "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 4, "model": [1, 7], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [2, 5, 6, 7], "more": [], "moscardi": [], "most": 5, "mozilla": [], "multi": 3, "multilingu": [], "multipl": [1, 2, 6], "multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 5, 7], "na": [], "name": [1, 5], "nation": [], "natur": 3, "ndarrai": [1, 2, 7], "necessari": [], "need": [4, 7], "neg": 6, "nest": [], "nestedobject": [], "netraj": [], "network": [3, 5], "neural": [3, 5], "new": [], "newer": [], "next": 1, "nois": [], "noisi": [1, 3], "non": [2, 3, 6, 7], "none": [1, 2, 7], "normal": [5, 6], "norwegian": [], "note": 0, "now": 3, "np": [5, 7], "num_output_channel": [], "num_sampl": [], "number": [1, 6, 7], "numpi": [2, 5, 7], "o": 4, "obb": [], "obj_detect": [], "object": 1, "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [1, 3, 7], "ocr_carea": [], "ocr_db_crnn": 7, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 5, "ocrdataset": 1, "ocrmetr": 7, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": 5, "one": [1, 5, 6], "oneof": 6, "ones": 1, "onli": [6, 7], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [3, 5], "optim": 3, "option": 1, "order": [1, 2, 5], "org": 5, "organ": 2, "orient": 2, "orientationpredictor": [], "other": [], "otherwis": 7, "our": 5, "out": [5, 6, 7], "outpout": [], "output": [2, 5, 6], "output_s": [2, 6], "outsid": [], "over": [4, 7], "overal": [], "overlai": 2, "overview": [], "overwrit": 1, "overwritten": [], "own": 3, "p": 6, "packag": 7, "pad": [1, 5, 6], "page": [4, 5, 7], "page1": 2, "page2": 2, "page_1": [], "page_idx": 2, "page_orientation_predictor": [], "page_param": [], "pair": 7, "pango": 4, "paper": 5, "par_1_1": [], "paragraph": [], "paragraph_break": [], "parallel": [], "param": [5, 6], "paramet": [1, 2, 3, 5, 6, 7], "pars": [1, 3], "parseq": [], "part": 6, "parti": [], "partial": [], "particip": [], "pass": [1, 5], "password": [], "patch": [], "path": [1, 2, 5], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "patil": [], "pattern": [], "pdf": [2, 5], "pdfpage": [], "peopl": [], "per": [5, 6], "perform": [2, 3, 5, 6, 7], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 2, "pick": 6, "pictur": 2, "pip": 4, "pipelin": [], "pixbuf": 4, "pixel": [2, 6], "platinum": 5, "pleas": [], "plot": 7, "plt": 7, "plug": [], "plugin": [], "png": 2, "point": [], "polici": [], "polish": [], "polit": [], "polygon": 1, "pool": [], "portugues": [], "posit": 7, "possibl": 7, "post": 5, "postprocessor": [], "potenti": 5, "power": 3, "ppageno": [], "pre": [], "precis": [5, 7], "pred": [], 
"pred_box": [], "pred_label": [], "predefin": 1, "predict": [2, 7], "predictor": [], "prefer": 1, "preinstal": [], "preprocessor": 5, "prerequisit": 3, "present": [], "preserv": 6, "preserve_aspect_ratio": 6, "pretrain": [3, 5, 7], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": 5, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 3], "processor": 5, "produc": 5, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 5, "provid": [3, 5], "public": 3, "publicli": [], "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": [], "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 7, "python": 3, "python3": [], "pytorch": [3, 4], "q": [], "qr": 2, "qr_code": [], "qualiti": 6, "quantiz": 5, "quantize_model": 5, "question": [], "quickli": 3, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [5, 7], "random": [5, 6, 7], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": [], "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": [], "randomsatur": 6, "randomshadow": [], "rang": 6, "rassi": [], "ratio": 6, "raw": [2, 7], "re": [], "read": [3, 5], "read_html": 2, "read_img": 2, "read_img_as_numpi": [], "read_img_as_tensor": [], "read_pdf": 2, "readi": [], "real": [5, 6], "realli": [], "reason": [], "rebuild": [], "rebuilt": [], "recal": [5, 7], "receipt": [1, 3, 5], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 7, "recognition_predictor": 5, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": [], "recurr": 3, "reduc": 6, "refer": 4, "regardless": [], "region": [], "regroup": 7, "regular": [], "reject": [], "rel": 2, "relat": [], "releas": [0, 4], "relev": [], "religion": [], "relu": 5, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [2, 5], "represent": 5, "request": [], "requir": [4, 6], "research": 3, "residu": [], "resiz": [5, 6], "resnet": 5, "resnet18": [], "resnet31": [], "resnet34": [], "resnet50": [], "resolv": 2, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [6, 7], "restrict": [], "result": [2, 5], "return": [1, 2, 5, 7], "reusabl": 5, "review": [], "rgb": [2, 6], "rgb_mode": [], "rgb_output": 2, "right": [5, 7], "roboflow": [], "robust": 3, "root": 1, "rotat": [1, 2], "rotated_bbox": [1, 7], "run": 4, "same": [2, 7], "sampl": 1, "sample_transform": 1, "sanjin": [], "sar": [3, 5], "sar_resnet31": 5, "sar_vgg16_bn": 5, "satur": 6, "save": [1, 5], "saved_model": 5, "scale": 7, "scale_rang": [], "scan": [1, 3], "scene": [3, 5], "scheme": 5, "score": 7, "scratch": 3, "script": [], "seamless": 3, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 5, "section": [], "secur": [], "see": [], "seemlessli": 3, "seen": 5, "segment": 5, "self": [], "semant": 5, "send": [], "sens": 7, "sensit": [], "separ": 5, "sequenc": [1, 2, 5, 7], "sequenti": [5, 6], "seri": [], "serial": 5, "serialized_model": 5, "seriou": [], "set": [1, 5, 7], "set_global_polici": [], "sever": [2, 6], "sex": [], "sexual": [], "sha256": [], "shade": [], "shape": [2, 5, 6, 7], "share": [], "shift": 6, "shm": [], "should": [1, 2, 7], "show": [2, 3, 5, 7], "showcas": [], "shuffl": 1, "side": 7, "signatur": 2, "signific": 1, "simpl": 5, "simpler": [], "sinc": 1, "singl": [], "single_img_doc": [], "size": [1, 2, 5, 6], "skew": [], 
"slack": [], "slightli": [], "small": 3, "smallest": 2, "snapshot_download": [], "snippet": [], "so": [1, 4], "social": [], "socio": [], "some": [], "someth": [], "somewher": [], "sort": [], "sourc": [1, 2, 5, 6, 7], "space": [], "span": [], "spanish": [], "spatial": 2, "special": 3, "specif": [1, 5, 7], "specifi": 2, "speed": [3, 5], "sphinx": [], "sroie": [1, 3], "stabl": 4, "stackoverflow": [], "stage": 3, "standalon": [], "standard": 6, "start": 1, "state": 3, "static": 7, "statist": 5, "statu": [], "std": 6, "step": [], "still": [], "str": [1, 2, 5, 6, 7], "straight": 1, "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 2, "street": [], "strict": [], "strictli": 7, "string": [1, 2, 5, 7], "strive": [], "strong": 5, "structur": [3, 5], "subset": [1, 5], "suggest": [], "sum": 7, "summari": 7, "support": 5, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": [], "symmetr": 6, "symmetric_pad": 6, "synthet": [], "synthtext": [], "system": [], "t": 1, "tabl": [], "take": [], "target": [1, 2, 5, 6], "target_s": 1, "task": [1, 3, 5], "task2": [], "team": [], "techminde": [], "templat": 2, "tensor": [1, 5, 6], "tensorflow": [3, 4, 5, 6], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [2, 7], "text_output": [], "textmatch": 7, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [3, 5], "textstylebrush": [], "textual": [1, 2, 3], "tf": [5, 6], "tf_model": 5, "tflite": 5, "than": [4, 7], "thank": [], "thei": [], "them": [1, 4], "thi": [4, 5, 7], "thing": [], "third": [], "those": [2, 4, 5], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 5, 7], "tini": [], "titl": 2, "tm": [], "tmp": [], "togeth": [2, 5], "tograi": 6, "tool": [], "top": 7, "topic": [], "torch": [], "torchvis": 6, "total": [], "toward": [], "train": [1, 5, 6], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": 5, "tranform": 6, "transcrib": [], "transfer": [], "transfo": 6, "transform": [1, 3], "translat": [], "troll": [], "true": [1, 2, 5, 6, 7], "truth": 7, "tune": 3, "tupl": [2, 5, 6, 7], "turn": [], "two": 2, "txt": [], "type": [2, 5], "typic": [], "u": [], "ucsd": [], "udac": [], "uint8": [2, 5, 7], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 2, "understand": [1, 3], "unidecod": 7, "uniform": [5, 6], "uniformli": [], "uninterrupt": 2, "union": 7, "unit": [], "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 5, "updat": 7, "upgrad": [], "upper": 6, "uppercas": [], "url": [1, 2], "us": [1, 4, 7], "usabl": 5, "usag": 5, "use_polygon": [], "useabl": [], "user": [2, 3, 4], "utf": [], "util": [3, 5], "v0": 3, "v1": [], "v3": [], "valid": [], "valu": [2, 6], "valuabl": 3, "variabl": [], "varieti": [], "veri": [], "verifi": 1, "verma": [], "version": 5, "vgg": 5, "vgg16": 5, "vgg16_bn_r": [], "via": 3, "video": [], "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 1, "visiontransform": [], "visual": 3, "visualize_pag": 7, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": [3, 5], "vocabulari": [], "w": [2, 7], "w3": [], "wa": [], "wai": [1, 3, 5], "want": [], "warm": 5, "warmup": [], "wasn": [], "we": [2, 3, 5, 6], "weasyprint": [], "web": 2, "websit": [], "welcom": 3, "well": [], "were": 2, "what": [], "when": [], "whenev": [], "where": [2, 7], "whether": [1, 2, 7], "which": 5, 
"whichev": 4, "while": 6, "why": [], "width": 2, "wiki": [], "wildreceipt": [], "window": [4, 7], "wish": [], "within": [], "without": 5, "wonder": [], "word": [3, 5, 7], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 7, "work": [], "worker": 1, "workflow": [], "worklow": [], "world": 7, "worth": [], "wrap": [], "wrapper": [1, 6], "write": [], "written": 2, "www": 2, "x": [2, 6, 7], "x12larg": 5, "x_ascend": [], "x_descend": [], "x_i": 7, "x_size": [], "x_wconf": [], "xeon": 5, "xhtml": [], "xmax": 2, "xmin": 2, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 7, "y_i": 7, "y_j": 7, "yet": [], "ymax": 2, "ymin": 2, "yolov8": [], "you": [4, 5], "your": [1, 2, 5, 7], "yoursit": 2, "yugesh": [], "zero": [5, 6], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "doctr.documents", "DocTR: Document Text Recognition", "Installation", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": [], "03": 0, "04": [], "05": 0, "07": [], "08": [], "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "21": [], "22": [], "27": [], "28": 0, "29": [], "3": [], "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 5, "architectur": [], "arg": [], "artefact": 2, "artefactdetect": [], "attribut": [], "avail": 1, "aw": [], "ban": [], "block": 2, "bug": [], "build": 3, "changelog": 0, "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 5, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 3], "detect": [3, 5], "develop": [], "do": [], "doctr": [1, 2, 3, 5, 6, 7], "document": [2, 3], "end": 5, "enforc": [], "evalu": 7, "export": 5, "factori": [], "featur": 3, "feedback": [], "file": 2, "from": [], "gener": [], "get": 3, "git": 4, "guidelin": [], "half": [], "hub": [], "huggingfac": [], "i": [], "implement": [], "infer": [], "instal": 4, "integr": [], "io": [], "lambda": [], "let": [], "line": 2, "linux": [], "load": 1, "loader": [], "main": 3, "mode": [], "model": [3, 5], "modifi": [], "modul": [], "name": [], "note": 3, "notebook": [], "object": [], "ocr": 5, "onli": [], "onnx": [], "optim": 
[], "option": [], "orient": [], "our": [], "output": [], "own": [], "packag": [3, 4], "page": 2, "perman": [], "pipelin": [], "pledg": [], "post": [], "pre": 5, "precis": [], "predictor": [3, 5], "prepar": [], "prerequisit": 4, "pretrain": [], "process": 5, "push": [], "python": 4, "qualiti": [], "question": [], "read": 2, "readi": [], "recognit": [3, 5], "refer": 3, "report": [], "request": [], "resourc": [], "respons": [], "return": [], "right": [], "savedmodel": 5, "scope": [], "share": [], "should": [], "stage": 5, "standard": [], "start": 3, "structur": 2, "style": [], "support": [1, 3, 6], "synthet": [], "task": 7, "temporari": [], "test": [], "text": [3, 5], "train": 3, "transform": 6, "two": 5, "unit": [], "us": 5, "util": 7, "v0": 0, "verif": [], "via": 4, "visual": 7, "vocab": 1, "warn": [], "what": [], "word": 2, "your": 3, "zoo": [3, 5]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[2, "correction"]], "2. Warning": [[2, "warning"]], "3. Temporary Ban": [[2, "temporary-ban"]], "4. Permanent Ban": [[2, "permanent-ban"]], "AWS Lambda": [[14, null]], "Advanced options": [[19, "advanced-options"]], "Args:": [[7, "args"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"], [7, "id16"], [7, "id19"], [7, "id22"], [7, "id25"], [7, "id29"], [7, "id32"], [7, "id37"], [7, "id40"], [7, "id46"], [7, "id49"], [7, "id50"], [7, "id51"], [7, "id54"], [7, "id57"], [7, "id60"], [7, "id61"], [8, "args"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id10"], [8, "id12"], [8, "id14"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id28"], [9, "args"], [9, "id3"], [9, "id8"], [9, "id13"], [9, "id17"], [9, "id21"], [9, "id26"], [9, "id31"], [9, "id36"], [9, "id41"], [9, "id46"], [9, "id50"], [9, "id54"], [9, "id59"], [9, "id63"], [9, "id68"], [9, "id73"], [9, "id77"], [9, "id81"], [9, "id85"], [9, "id90"], [9, "id95"], [9, "id99"], [9, "id104"], [9, "id109"], [9, "id114"], [9, "id119"], [9, "id123"], [9, "id127"], [9, "id132"], [9, "id137"], [9, "id142"], [9, "id146"], [9, "id150"], [9, "id155"], [9, "id159"], [9, "id163"], [9, "id167"], [9, "id169"], [9, "id171"], [9, "id173"], [10, "args"], [10, "id1"], [10, "id2"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"], [10, "id10"], [10, "id11"], [10, "id12"], [10, "id13"], [10, "id14"], [10, "id15"], [10, "id16"], [10, "id17"], [10, "id18"], [10, "id19"], [11, "args"], [11, "id3"], [11, "id4"], [11, "id5"], [11, "id6"], [11, "id7"], [11, "id8"], [11, "id9"]], "Artefact": [[8, "artefact"]], "ArtefactDetection": [[16, "artefactdetection"]], "Attribution": [[2, "attribution"]], "Available Datasets": [[17, "available-datasets"]], "Available architectures": [[19, "available-architectures"], [19, "id1"], [19, "id2"]], "Available contribution modules": [[16, "available-contribution-modules"]], "Block": [[8, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[17, null]], "Choosing the right model": [[19, null]], "Classification": [[15, "classification"]], "Code quality": [[3, "code-quality"]], "Code style verification": [[3, "code-style-verification"]], "Codebase structure": [[3, "codebase-structure"]], "Commits": [[3, "commits"]], "Community resources": [[1, null]], "Composing transformations": [[10, "composing-transformations"]], "Continuous Integration": [[3, "continuous-integration"]], "Contributing to docTR": [[3, null]], "Contributor Covenant Code of Conduct": [[2, null]], "Custom dataset loader": [[7, "custom-dataset-loader"]], 
"Custom orientation classification models": [[13, "custom-orientation-classification-models"]], "Data Loading": [[17, "data-loading"]], "Dataloader": [[7, "dataloader"]], "Detection": [[15, "detection"], [17, "detection"]], "Detection predictors": [[19, "detection-predictors"]], "Developer mode installation": [[3, "developer-mode-installation"]], "Developing docTR": [[3, "developing-doctr"]], "Document": [[8, "document"]], "Document structure": [[8, "document-structure"]], "End-to-End OCR": [[19, "end-to-end-ocr"]], "Enforcement": [[2, "enforcement"]], "Enforcement Guidelines": [[2, "enforcement-guidelines"]], "Enforcement Responsibilities": [[2, "enforcement-responsibilities"]], "Export to ONNX": [[18, "export-to-onnx"]], "Feature requests & bug report": [[3, "feature-requests-bug-report"]], "Feedback": [[3, "feedback"]], "File reading": [[8, "file-reading"]], "Half-precision": [[18, "half-precision"]], "Installation": [[4, null]], "Integrate contributions into your pipeline": [[16, null]], "Let\u2019s connect": [[3, "let-s-connect"]], "Line": [[8, "line"]], "Loading from Huggingface Hub": [[15, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[13, "loading-your-custom-trained-model"]], "Loading your custom trained orientation classification model": [[13, "loading-your-custom-trained-orientation-classification-model"]], "Main Features": [[5, "main-features"]], "Model optimization": [[18, "model-optimization"]], "Model zoo": [[5, "model-zoo"]], "Modifying the documentation": [[3, "modifying-the-documentation"]], "Naming conventions": [[15, "naming-conventions"]], "OCR": [[17, "ocr"]], "Object Detection": [[17, "object-detection"]], "Our Pledge": [[2, "our-pledge"]], "Our Standards": [[2, "our-standards"]], "Page": [[8, "page"]], "Preparing your model for inference": [[18, null]], "Prerequisites": [[4, "prerequisites"]], "Pretrained community models": [[15, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[15, "pushing-to-the-huggingface-hub"]], "Questions": [[3, "questions"]], "Recognition": [[15, "recognition"], [17, "recognition"]], "Recognition predictors": [[19, "recognition-predictors"]], "Returns:": [[7, "returns"], [8, "returns"], [8, "id11"], [8, "id13"], [8, "id15"], [8, "id19"], [8, "id23"], [8, "id27"], [8, "id31"], [9, "returns"], [9, "id6"], [9, "id11"], [9, "id16"], [9, "id20"], [9, "id24"], [9, "id29"], [9, "id34"], [9, "id39"], [9, "id44"], [9, "id49"], [9, "id53"], [9, "id57"], [9, "id62"], [9, "id66"], [9, "id71"], [9, "id76"], [9, "id80"], [9, "id84"], [9, "id88"], [9, "id93"], [9, "id98"], [9, "id102"], [9, "id107"], [9, "id112"], [9, "id117"], [9, "id122"], [9, "id126"], [9, "id130"], [9, "id135"], [9, "id140"], [9, "id145"], [9, "id149"], [9, "id153"], [9, "id158"], [9, "id162"], [9, "id166"], [9, "id168"], [9, "id170"], [9, "id172"], [11, "returns"]], "Scope": [[2, "scope"]], "Share your model with the community": [[15, null]], "Supported Vocabs": [[7, "supported-vocabs"]], "Supported contribution modules": [[6, "supported-contribution-modules"]], "Supported datasets": [[5, "supported-datasets"]], "Supported transformations": [[10, "supported-transformations"]], "Synthetic dataset generator": [[7, "synthetic-dataset-generator"], [17, "synthetic-dataset-generator"]], "Task evaluation": [[11, "task-evaluation"]], "Text Detection": [[19, "text-detection"]], "Text Recognition": [[19, "text-recognition"]], "Text detection models": [[5, "text-detection-models"]], "Text recognition models": [[5, "text-recognition-models"]], "Train 
your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], 
"channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], 
"line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation 
(class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", 
false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", 
"Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], 
"51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, 
"b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], 
"db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 
18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], 
"json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 
19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 
15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 
3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 
18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, 
"coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.5.1/transforms.html b/v0.5.1/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.5.1/transforms.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.

-
-

Supported transformations

-

Here are all transformations that are available through DocTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
- -
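A minimal sketch combining the aspect-ratio options above, assuming the same TensorFlow setup as the other examples on this page:
>>> from doctr.transforms import Resize
>>> import tensorflow as tf
>>> # Keep the original aspect ratio and pad symmetrically with zeros to reach (32, 32)
>>> transfo = Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
>>> out = transfo(tf.random.uniform(shape=[64, 128, 3], minval=0, maxval=1))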
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor to a Gaussian distribution for each channel

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a user-defined function to a tensor (image or batch of images)

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): -convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta -to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting -each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and -increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.

-
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomGamma, RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, from which only one will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Apply the input transformation with probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
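These composition utilities are typically chained to build a full augmentation pipeline. A minimal sketch, assuming the same TensorFlow setup as the examples above:
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, OneOf, RandomApply, RandomGamma, RandomJpegQuality, Resize
>>> # Resize every sample, then apply one random photometric distortion 30% of the time
>>> transfos = Compose([
...     Resize((32, 32)),
...     RandomApply(OneOf([RandomGamma(), RandomJpegQuality()]), p=.3),
... ])
>>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))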
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.5.1/using_doctr/custom_models_training.html b/v0.5.1/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.5.1/using_doctr/custom_models_training.html +++ b/v0.5.1/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.5.1/using_doctr/running_on_aws.html b/v0.5.1/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.5.1/using_doctr/running_on_aws.html +++ b/v0.5.1/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.5.1/using_doctr/sharing_models.html b/v0.5.1/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.5.1/using_doctr/sharing_models.html +++ b/v0.5.1/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.5.1/using_doctr/using_contrib_modules.html b/v0.5.1/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.5.1/using_doctr/using_contrib_modules.html +++ b/v0.5.1/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.5.1/using_doctr/using_datasets.html b/v0.5.1/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.5.1/using_doctr/using_datasets.html +++ b/v0.5.1/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.5.1/using_doctr/using_model_export.html b/v0.5.1/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.5.1/using_doctr/using_model_export.html +++ b/v0.5.1/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.5.1/using_doctr/using_models.html b/v0.5.1/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.5.1/using_doctr/using_models.html +++ b/v0.5.1/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.5.1/utils.html b/v0.5.1/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.5.1/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.utils

-

This module regroups non-core features that are complementary to the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest windows side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model's performance.

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements a text match metric (word-level accuracy) for the recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, -TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, -f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, -\(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode -counterpart and its lower-case unidecode counterpart

-
-
-
- -
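Because the raw metric is an aggregate over all submitted pairs, predictions can be accumulated batch by batch before summarizing. A minimal sketch, assuming update keeps accumulating counts across calls as the formula above suggests:
>>> from doctr.utils import TextMatch
>>> metric = TextMatch()
>>> for gts, preds in [(['Hello', 'world'], ['hello', 'world']), (['again'], ['again'])]:
...     metric.update(gts, preds)
>>> metric.summary()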
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ -Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, -g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float | None, float | None, float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores

-
-
-
- -
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements an end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, -\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ -Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, -h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{L}\) is the set of possible character sequences, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall & precision for each string comparison flexibility and the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/datasets/cord.html b/v0.6.0/_modules/doctr/datasets/cord.html index f98ee6901c..55b0584830 100644 --- a/v0.6.0/_modules/doctr/datasets/cord.html +++ b/v0.6.0/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.cord

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
-from doctr.utils.geometry import fit_rbbox
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['CORD']
+__all__ = ["CORD"]
 
 
 
-[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - Example:: - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 + :align: center + + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', - '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', - '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", + "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", + "cord_train.zip", + ) + + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", + "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", + "cord_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - - # # List images - self.root = os.path.join(self._root, 'image') - self.data: List[Tuple[str, Dict[str, Any]]] = [] + # List images + tmp_root = os.path.join(self.root, "image") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] self.train = train - self.sample_transforms = sample_transforms - for img_path in os.listdir(self.root): + np_dtype = np.float32 + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem _targets = [] - with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: if len(word["text"]) > 0: x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - if rotated_bbox: - box = list(fit_rbbox(np.array([ - [x[0], y[0]], - [x[1], y[1]], - [x[2], y[2]], - [x[3], y[3]], - ], dtype=np.float32))) + box: Union[List[float], np.ndarray] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + box = np.array( + [ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], + dtype=np_dtype, + ) else: - # Reduce 8 coords to 4 + # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax box = [min(x), min(y), max(x), max(y)] - _targets.append((word['text'], box)) + _targets.append((word["text"], box)) text_targets, box_targets = zip(*_targets) - self.data.append(( - img_path, - dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets) - )) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) + ) + for crop, label in zip(crops, list(text_targets)): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
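A minimal usage sketch of the task-specific modes shown in this listing (the recognition_task and detection_task flags come from the signature above; both default to False and, per the ValueError in the constructor, cannot be enabled together):
>>> from doctr.datasets import CORD
>>> # Word crops with their transcriptions only, for recognition training
>>> reco_set = CORD(train=True, download=True, recognition_task=True)
>>> # Full pages with boxes only, for detection training
>>> det_set = CORD(train=True, download=True, detection_task=True)
>>> img, target = reco_set[0]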
@@ -397,8 +461,8 @@

Source code for doctr.datasets.cord

       
     
   
-
- + + diff --git a/v0.6.0/_modules/doctr/datasets/core.html b/v0.6.0/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.6.0/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
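For reference, the removed base class above was consumed by subclassing it and forwarding a download URL; the URL below is a placeholder, not a real release asset.

class MyArchiveDataset(VisionDataset):
    def __init__(self, **kwargs):
        super().__init__(
            url="https://example.com/my_dataset.zip",  # placeholder
            file_hash=None,
            extract_archive=True,
            download=True,
            **kwargs,
        )
        # subclasses then listed their images from self._root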
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.6.0/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.6.0/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset, _VisionDataset): - pass
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/datasets/detection.html b/v0.6.0/_modules/doctr/datasets/detection.html index 739563e466..718001e4cf 100644 --- a/v0.6.0/_modules/doctr/datasets/detection.html +++ b/v0.6.0/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -430,7 +430,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/doc_artefacts.html b/v0.6.0/_modules/doctr/datasets/doc_artefacts.html index 3313ae4660..94c32aaa0f 100644 --- a/v0.6.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.6.0/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.6.0/_modules/doctr/datasets/funsd.html b/v0.6.0/_modules/doctr/datasets/funsd.html index 35d7ad4cf5..f08612f9fa 100644 --- a/v0.6.0/_modules/doctr/datasets/funsd.html +++ b/v0.6.0/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.funsd

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['FUNSD']
+__all__ = ["FUNSD"]
 
 
 
-[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - Example:: - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 + :align: center + + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' - SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' - FILE_NAME = 'funsd.zip' + URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" + SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" + FILE_NAME = "funsd.zip" def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + super().__init__( + self.URL, + self.FILE_NAME, + self.SHA256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - self.sample_transforms = sample_transforms + np_dtype = np.float32 # Use the subset - subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') + subfolder = os.path.join("dataset", "training_data" if train else "testing_data") # # List images - self.root = os.path.join(self._root, subfolder, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + tmp_root = os.path.join(self.root, subfolder, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: + with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: data = json.load(f) - _targets = [(word['text'], word['box']) for block in data['form'] - for word in block['words'] if len(word['text']) > 0] + _targets = [ + (word["text"], word["box"]) + for block in data["form"] + for word in block["words"] + if len(word["text"]) > 0 + ] text_targets, box_targets = zip(*_targets) - if rotated_bbox: - # box_targets: xmin, ymin, xmax, ymax -> x, y, w, h, alpha = 0 - box_targets = [ + if use_polygons: + # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + box_targets = [ # type: ignore[assignment] [ - (box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0 - ] for box in box_targets + [box[0], box[1]], + [box[2], box[1]], + [box[2], box[3]], + [box[0], box[3]], + ] + for box in box_targets ] - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets))) + if recognition_task: + crops = crop_bboxes_from_image( + img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) + ) + for crop, label in zip(crops, list(text_targets)): + # filter labels with unknown characters + if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype))) + else: + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), + )) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
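A short sketch of the updated FUNSD interface, assuming the archive downloads successfully; with use_polygons each box becomes a 4-point quadrilateral.

from doctr.datasets import FUNSD

train_set = FUNSD(train=True, download=True, use_polygons=True)
img, target = train_set[0]
# target["boxes"]: float array of shape (N, 4, 2), one (x, y) quadrilateral per word
# target["labels"]: list of the N transcriptions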
@@ -386,8 +453,8 @@

Source code for doctr.datasets.funsd

       
     
   
-
- + + diff --git a/v0.6.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.6.0/_modules/doctr/datasets/generator/tensorflow.html index 9f562582d9..a3e619f720 100644 --- a/v0.6.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.6.0/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.6.0/_modules/doctr/datasets/ic03.html b/v0.6.0/_modules/doctr/datasets/ic03.html index 3d221d07de..60e54a8a4b 100644 --- a/v0.6.0/_modules/doctr/datasets/ic03.html +++ b/v0.6.0/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -468,7 +468,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/ic13.html b/v0.6.0/_modules/doctr/datasets/ic13.html index 8137e08e9f..219c98dcd1 100644 --- a/v0.6.0/_modules/doctr/datasets/ic13.html +++ b/v0.6.0/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -440,7 +440,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/iiit5k.html b/v0.6.0/_modules/doctr/datasets/iiit5k.html index 1fc8ecfb27..b49c80fe18 100644 --- a/v0.6.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.6.0/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/iiithws.html b/v0.6.0/_modules/doctr/datasets/iiithws.html index 07f5b13685..f7220afbc7 100644 --- a/v0.6.0/_modules/doctr/datasets/iiithws.html +++ b/v0.6.0/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/imgur5k.html b/v0.6.0/_modules/doctr/datasets/imgur5k.html index 68d433ca62..51c6545db4 100644 --- a/v0.6.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.6.0/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -488,7 +488,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/loader.html b/v0.6.0/_modules/doctr/datasets/loader.html index d32e6da298..ed80350ef0 100644 --- a/v0.6.0/_modules/doctr/datasets/loader.html +++ b/v0.6.0/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.loader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-import tensorflow as tf
-import numpy as np
-from typing import Optional
+from typing import Callable, Optional
 
-from .multithreading import multithread_exec
+import numpy as np
+import tensorflow as tf
 
 __all__ = ["DataLoader"]
 
@@ -293,12 +314,13 @@ 

Source code for doctr.datasets.loader

     """Collate multiple elements into batches
 
     Args:
+    ----
         samples: list of N tuples containing M elements
 
     Returns:
+    -------
         Tuple of M sequences containing N elements each
     """
-
     batch_data = zip(*samples)
 
     tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
@@ -307,23 +329,23 @@ 

Source code for doctr.datasets.loader

 
 
 
-[docs] +[docs] class DataLoader: """Implements a dataset wrapper for fast data loading - Example:: - >>> from doctr.datasets import FUNSD, DataLoader - >>> train_set = CORD(train=True, download=True) - >>> train_loader = DataLoader(train_set, batch_size=32) - >>> train_iter = iter(train_loader) - >>> images, targets = next(train_iter) + >>> from doctr.datasets import CORD, DataLoader + >>> train_set = CORD(train=True, download=True) + >>> train_loader = DataLoader(train_set, batch_size=32) + >>> train_iter = iter(train_loader) + >>> images, targets = next(train_iter) Args: + ---- dataset: the dataset shuffle: whether the samples should be shuffled before passing it to the iterator batch_size: number of elements in each batch drop_last: if `True`, drops the last batch if it isn't full - workers: number of workers to use for data loading + collate_fn: function to merge samples into a batch """ def __init__( @@ -332,17 +354,22 @@

Source code for doctr.datasets.loader

         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        workers: Optional[int] = None,
+        collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
         self.batch_size = batch_size
         nb = len(self.dataset) / batch_size
         self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
-        self.workers = workers
+        if collate_fn is None:
+            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
+        else:
+            self.collate_fn = collate_fn
         self.reset()
 
+    def __len__(self) -> int:
+        return self.num_batches
+
     def reset(self) -> None:
         # Updates indices after each epoch
         self._num_yielded = 0
@@ -358,9 +385,9 @@ 

Source code for doctr.datasets.loader

         if self._num_yielded < self.num_batches:
             # Get next indices
             idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
+            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
 
-            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
+            samples = list(map(self.dataset.__getitem__, indices))
 
             batch_data = self.collate_fn(samples)
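Batching is now customised through collate_fn rather than a workers count. A sketch, where train_set stands for any of the datasets above and the collate function simply stacks the images:

import tensorflow as tf
from doctr.datasets import DataLoader

def my_collate(samples):
    # stack the images into one tensor, keep the raw targets as a list
    images, targets = zip(*samples)
    return tf.stack(images, axis=0), list(targets)

loader = DataLoader(train_set, batch_size=32, collate_fn=my_collate)
train_iter = iter(loader)
images, targets = next(train_iter)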
 
@@ -401,8 +428,8 @@ 

Source code for doctr.datasets.loader

       
     
   
-
- +
+ diff --git a/v0.6.0/_modules/doctr/datasets/mjsynth.html b/v0.6.0/_modules/doctr/datasets/mjsynth.html index 77bb01d523..df34e49cf9 100644 --- a/v0.6.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.6.0/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/ocr.html b/v0.6.0/_modules/doctr/datasets/ocr.html index 11297d5952..ce1ed8b0d4 100644 --- a/v0.6.0/_modules/doctr/datasets/ocr.html +++ b/v0.6.0/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.ocr

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import json
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple
 
-from .datasets import AbstractDataset
-from doctr.utils.geometry import fit_rbbox
+import numpy as np
 
+from .datasets import AbstractDataset
 
-__all__ = ['OCRDataset']
+__all__ = ["OCRDataset"]
 
 
 
-[docs] +[docs] class OCRDataset(AbstractDataset): """Implements an OCR dataset + >>> from doctr.datasets import OCRDataset + >>> train_set = OCRDataset(img_folder="/path/to/images", + >>> label_file="/path/to/labels.json") + >>> img, target = train_set[0] + Args: + ---- img_folder: local path to image folder (all jpg at the root) label_file: local path to the label file - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) - **kwargs: keyword arguments from `VisionDataset`. + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + **kwargs: keyword arguments from `AbstractDataset`. """ def __init__( self, img_folder: str, label_file: str, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, **kwargs: Any, ) -> None: - - self.sample_transforms = sample_transforms - self.root = img_folder + super().__init__(img_folder, **kwargs) # List images self.data: List[Tuple[str, Dict[str, Any]]] = [] - with open(label_file, 'rb') as f: + np_dtype = np.float32 + with open(label_file, "rb") as f: data = json.load(f) - for file_dic in data: + for img_name, annotations in data.items(): # Get image path - img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg' + img_name = Path(img_name) # File existence check if not os.path.exists(os.path.join(self.root, img_name)): raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") # handle empty images - if (len(file_dic["coordinates"]) == 0 or - (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")): - self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[]))) + if len(annotations["typed_words"]) == 0: + self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[]))) continue - is_valid: List[bool] = [] - box_targets: List[List[float]] = [] - for box in file_dic["coordinates"]: - if rotated_bbox: - x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32)) - box = [x, y, w, h, alpha] - is_valid.append(w > 0 and h > 0) - else: - xs, ys = zip(*box) - box = [min(xs), min(ys), max(xs), max(ys)] - is_valid.append(box[0] < box[2] and box[1] < box[3]) - if is_valid[-1]: - box_targets.append(box) + # Unpack the straight boxes (xmin, ymin, xmax, ymax) + geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]] + if use_polygons: + # (x, y) coordinates of top left, top right, bottom right, bottom left corners + geoms = [ + [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]] # type: ignore[list-item] + for geom in geoms + ] + + text_targets = [obj["value"] for obj in annotations["typed_words"]] - text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid] - self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
+ self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
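The label file is now a JSON mapping from image names to their typed words; a minimal sketch with placeholder paths and the expected entry layout:

from doctr.datasets import OCRDataset

# labels.json (placeholder path) is expected to look like:
# {"img_1.jpg": {"typed_words": [{"value": "Hello", "geometry": [10, 10, 60, 30]}, ...]}, ...}
train_set = OCRDataset(img_folder="/path/to/images", label_file="/path/to/labels.json")
img, target = train_set[0]  # target: dict(boxes=<N x 4 float array>, labels=[...])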
@@ -383,8 +402,8 @@

Source code for doctr.datasets.ocr

       
     
   
- - + + diff --git a/v0.6.0/_modules/doctr/datasets/recognition.html b/v0.6.0/_modules/doctr/datasets/recognition.html index 512c70c308..1754789364 100644 --- a/v0.6.0/_modules/doctr/datasets/recognition.html +++ b/v0.6.0/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/sroie.html b/v0.6.0/_modules/doctr/datasets/sroie.html index 66fd4ca3e0..04cf10bda2 100644 --- a/v0.6.0/_modules/doctr/datasets/sroie.html +++ b/v0.6.0/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.sroie

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import os
 import csv
-import numpy as np
+import os
 from pathlib import Path
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+from tqdm import tqdm
 
 from .datasets import VisionDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
 
-__all__ = ['SROIE']
+__all__ = ["SROIE"]
 
 
 
-[docs] +[docs] class SROIE(VisionDataset): """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction" <https://arxiv.org/pdf/2103.10213.pdf>`_. - Example:: - >>> from doctr.datasets import SROIE - >>> train_set = SROIE(train=True, download=True) - >>> img, target = train_set[0] + .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0 + :align: center + + >>> from doctr.datasets import SROIE + >>> train_set = SROIE(train=True, download=True) + >>> img, target = train_set[0] Args: + ---- train: whether the subset should be the training one - sample_transforms: composable transformations that will be applied to each image - rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + recognition_task: whether the dataset should be used for recognition task + detection_task: whether the dataset should be used for detection task **kwargs: keyword arguments from `VisionDataset`. """ - TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip', - 'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f') - TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip', - '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2') + TRAIN = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0", + "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f", + "sroie2019_train_task1.zip", + ) + TEST = ( + "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0", + "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2", + "sroie2019_test.zip", + ) def __init__( self, train: bool = True, - sample_transforms: Optional[Callable[[Any], Any]] = None, - rotated_bbox: bool = False, + use_polygons: bool = False, + recognition_task: bool = False, + detection_task: bool = False, **kwargs: Any, ) -> None: + url, sha256, name = self.TRAIN if train else self.TEST + super().__init__( + url, + name, + sha256, + True, + pre_transforms=convert_target_to_relative if not recognition_task else None, + **kwargs, + ) + if recognition_task and detection_task: + raise ValueError( + "`recognition_task` and `detection_task` cannot be set to True simultaneously. " + + "To get the whole dataset with boxes and labels leave both parameters to False." 
+ ) - url, sha256 = self.TRAIN if train else self.TEST - super().__init__(url, None, sha256, True, **kwargs) - self.sample_transforms = sample_transforms self.train = train - if rotated_bbox: - raise NotImplementedError + tmp_root = os.path.join(self.root, "images") + self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = [] + np_dtype = np.float32 - # # List images - self.root = os.path.join(self._root, 'images') - self.data: List[Tuple[str, Dict[str, Any]]] = [] - for img_path in os.listdir(self.root): + for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))): # File existence check - if not os.path.exists(os.path.join(self.root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") + if not os.path.exists(os.path.join(tmp_root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") + stem = Path(img_path).stem - _targets = [] - with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f: - for row in csv.reader(f, delimiter=','): - # Safeguard for blank lines - if len(row) > 0: - # Label may contain commas - label = ",".join(row[8:]) - # Reduce 8 coords to 4 - p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8]) - left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x) - top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y) - if len(label) > 0: - _targets.append((label, [left, top, right, bot])) - - text_targets, box_targets = zip(*_targets) - - self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets))) + with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f: + _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0] + + labels = [",".join(row[8:]) for row in _rows] + # reorder coordinates (8 -> (4,2) -> + # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines + coords: np.ndarray = np.stack( + [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0 + ) + + if not use_polygons: + # xmin, ymin, xmax, ymax + coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1) + + if recognition_task: + crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords) + for crop, label in zip(crops, labels): + if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0: + self.data.append((crop, label)) + elif detection_task: + self.data.append((img_path, coords)) + else: + self.data.append((img_path, dict(boxes=coords, labels=labels))) + + self.root = tmp_root def extra_repr(self) -> str: return f"train={self.train}"
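Usage mirrors CORD; a sketch of the detection-only mode, which yields coordinate arrays instead of labelled dictionaries (download is assumed to succeed):

from doctr.datasets import SROIE

det_set = SROIE(train=False, download=True, detection_task=True)
img, boxes = det_set[0]  # boxes: (N, 4) straight boxes, or (N, 4, 2) with use_polygons=True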
@@ -390,8 +444,8 @@

Source code for doctr.datasets.sroie

       
     
   
-
- + + diff --git a/v0.6.0/_modules/doctr/datasets/svhn.html b/v0.6.0/_modules/doctr/datasets/svhn.html index 48e4e4d210..60e02b1b3b 100644 --- a/v0.6.0/_modules/doctr/datasets/svhn.html +++ b/v0.6.0/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/svt.html b/v0.6.0/_modules/doctr/datasets/svt.html index 4144dc6b9b..a997fcbb50 100644 --- a/v0.6.0/_modules/doctr/datasets/svt.html +++ b/v0.6.0/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -459,7 +459,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/synthtext.html b/v0.6.0/_modules/doctr/datasets/synthtext.html index 3b9de506a7..c776e1d673 100644 --- a/v0.6.0/_modules/doctr/datasets/synthtext.html +++ b/v0.6.0/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -470,7 +470,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.6.0/_modules/doctr/datasets/utils.html b/v0.6.0/_modules/doctr/datasets/utils.html index 2259698c0f..bde9304597 100644 --- a/v0.6.0/_modules/doctr/datasets/utils.html +++ b/v0.6.0/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.datasets.utils

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import string
 import unicodedata
+from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Sequence as SequenceType
+
 import numpy as np
-from typing import List, Optional, Any
+from PIL import Image
+
+from doctr.io.image import get_img_shape
+from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
 
 from .vocabs import VOCABS
 
-__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
+__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
+
+ImageTensor = TypeVar("ImageTensor")
 
 
 def translate(
     input_string: str,
     vocab_name: str,
-    unknown_char: str = '■',
+    unknown_char: str = "■",
 ) -> str:
     """Translate a string input in a given vocabulary
 
     Args:
+    ----
         input_string: input string to translate
         vocab_name: vocabulary to use (french, latin, ...)
         unknown_char: unknown character for non-translatable characters
 
     Returns:
-        A string translated in a given vocab"""
-
+    -------
+        A string translated in a given vocab
+    """
     if VOCABS.get(vocab_name) is None:
         raise KeyError("output vocabulary must be in vocabs dictionary")
 
-    translated = ''
+    translated = ""
     for char in input_string:
         if char not in VOCABS[vocab_name]:
             # we need to translate char into a vocab char
@@ -315,51 +350,63 @@ 

Source code for doctr.datasets.utils

                 # remove whitespaces
                 continue
             # normalize character if it is not in vocab
-            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
-            if char == '' or char not in VOCABS[vocab_name]:
+            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
+            if char == "" or char not in VOCABS[vocab_name]:
                 # if normalization fails or char still not in vocab, return unknown character
                 char = unknown_char
         translated += char
     return translated
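For instance, assuming a "french" entry exists in VOCABS, characters outside the vocabulary are NFD-normalised to ASCII when possible and otherwise replaced by unknown_char:

from doctr.datasets.utils import translate

cleaned = translate("Ĥello wörld", "french", unknown_char="■")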
 
 
-def encode_sequence(
+def encode_string(
     input_string: str,
     vocab: str,
 ) -> List[int]:
     """Given a predefined mapping, encode the string to a sequence of numbers
 
     Args:
+    ----
         input_string: string to encode
         vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A list encoding the input_string"""
-
-    return list(map(vocab.index, input_string))  # type: ignore[arg-type]
+    -------
+        A list encoding the input_string
+    """
+    try:
+        return list(map(vocab.index, input_string))
+    except ValueError:
+        raise ValueError(
+            f"some characters cannot be found in 'vocab'. \
+                         Please check the input string {input_string} and the vocabulary {vocab}"
+        )
 
 
 def decode_sequence(
-    input_array: np.array,
+    input_seq: Union[np.ndarray, SequenceType[int]],
     mapping: str,
 ) -> str:
     """Given a predefined mapping, decode the sequence of numbers to a string
 
     Args:
-        input_array: array to decode
+    ----
+        input_seq: array to decode
         mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
 
     Returns:
-        A string, decoded from input_array"""
-
-    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
+    -------
+        A string, decoded from input_seq
+    """
+    if not isinstance(input_seq, (Sequence, np.ndarray)):
+        raise TypeError("Invalid sequence type")
+    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
         raise AssertionError("Input must be an array of int, with max less than mapping size")
-    decoded = ''.join(mapping[idx] for idx in input_array)
-    return decoded
+
+    return "".join(map(mapping.__getitem__, input_seq))
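A toy round trip with an ad-hoc six-character vocab illustrates the renamed helper and its decoding counterpart:

from doctr.datasets.utils import decode_sequence, encode_string

vocab = "abcdef"
seq = encode_string("fade", vocab)           # [5, 0, 3, 4]
assert decode_sequence(seq, vocab) == "fade"
# encode_string now raises a ValueError if a character is missing from the vocab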
 
 
 
-[docs] +[docs] def encode_sequences( sequences: List[str], vocab: str, @@ -367,48 +414,53 @@

Source code for doctr.datasets.utils

     eos: int = -1,
     sos: Optional[int] = None,
     pad: Optional[int] = None,
-    **kwargs: Any,
+    dynamic_seq_length: bool = False,
 ) -> np.ndarray:
     """Encode character sequences using a given vocab as mapping
 
     Args:
+    ----
         sequences: the list of character sequences of size N
         vocab: the ordered vocab to use for encoding
         target_size: maximum length of the encoded data
         eos: encoding of End Of String
         sos: optional encoding of Start Of String
         pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
+        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
 
     Returns:
+    -------
         the padded encoded data as a tensor
     """
-
     if 0 <= eos < len(vocab):
         raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
 
-    if not isinstance(target_size, int):
-        target_size = max(len(w) for w in sequences)
-        if sos:
-            target_size += 1
-        if pad:
-            target_size += 1
+    if not isinstance(target_size, int) or dynamic_seq_length:
+        # Maximum string length + EOS
+        max_length = max(len(w) for w in sequences) + 1
+        if isinstance(sos, int):
+            max_length += 1
+        if isinstance(pad, int):
+            max_length += 1
+        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
 
     # Pad all sequences
-    if pad:  # pad with padding symbol
+    if isinstance(pad, int):  # pad with padding symbol
         if 0 <= pad < len(vocab):
             raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
         # In that case, add EOS at the end of the word before padding
-        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
+        default_symbol = pad
     else:  # pad with eos symbol
-        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
+        default_symbol = eos
+    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
 
-    for idx, seq in enumerate(sequences):
-        encoded_seq = encode_sequence(seq, vocab)
-        if pad:  # add eos at the end of the sequence
-            encoded_seq.append(eos)
-        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
+    # Encode the strings
+    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
+        if isinstance(pad, int):  # add eos at the end of the sequence
+            seq.append(eos)
+        encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)]
 
-    if sos:  # place eos symbol at the beginning of each sequence
+    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
         if 0 <= sos < len(vocab):
             raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
         encoded_data = np.roll(encoded_data, 1)
@@ -416,6 +468,59 @@ 

Source code for doctr.datasets.utils

 
     return encoded_data
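A small sketch of the padded encoding with a toy vocab; the EOS and PAD indices must lie outside the vocab range:

from doctr.datasets.utils import encode_sequences

vocab = "abc"
encoded = encode_sequences(["ab", "c"], vocab, eos=len(vocab), pad=len(vocab) + 1)
# int32 array of shape (2, 4): each row holds the encoded word, then EOS (3), padded with PAD (4)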
+ + +def convert_target_to_relative( + img: ImageTensor, target: Union[np.ndarray, Dict[str, Any]] +) -> Tuple[ImageTensor, Union[Dict[str, Any], np.ndarray]]: + if isinstance(target, np.ndarray): + target = convert_to_relative_coords(target, get_img_shape(img)) + else: + target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img)) + return img, target + + +def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]: + """Crop a set of bounding boxes from an image + + Args: + ---- + img_path: path to the image + geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4) + + Returns: + ------- + a list of cropped images + """ + with Image.open(img_path) as pil_img: + img: np.ndarray = np.array(pil_img.convert("RGB")) + # Polygon + if geoms.ndim == 3 and geoms.shape[1:] == (4, 2): + return extract_rcrops(img, geoms.astype(dtype=int)) + if geoms.ndim == 2 and geoms.shape[1] == 4: + return extract_crops(img, geoms.astype(dtype=int)) + raise ValueError("Invalid geometry format") + + +def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]: + """Converts multiclass target to relative coordinates. + + Args: + ---- + img: Image + target: tuple of target polygons and their classes names + + Returns: + ------- + Image and dictionary of boxes, with class names as keys + """ + boxes = convert_to_relative_coords(target[0], get_img_shape(img)) + boxes_classes = target[1] + boxes_dict: Dict = {k: [] for k in sorted(set(boxes_classes))} + for k, poly in zip(boxes_classes, boxes): + boxes_dict[k].append(poly) + boxes_dict = {k: np.stack(v, axis=0) for k, v in boxes_dict.items()} + return img, boxes_dict
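The recognition variants of the datasets above rely on this helper; a standalone sketch with a placeholder image path and absolute pixel boxes:

import numpy as np
from doctr.datasets.utils import crop_bboxes_from_image

boxes = np.array([[10, 10, 120, 40], [15, 60, 200, 95]])    # (N, 4) straight boxes
crops = crop_bboxes_from_image("receipt.jpg", geoms=boxes)  # placeholder path -> list of np.ndarray crops
# polygons of shape (N, 4, 2) are also accepted and routed to extract_rcrops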
@@ -448,8 +553,8 @@

Source code for doctr.datasets.utils

       
     
   
- - + + diff --git a/v0.6.0/_modules/doctr/datasets/wildreceipt.html b/v0.6.0/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.6.0/_modules/doctr/datasets/wildreceipt.html +++ b/v0.6.0/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.6.0/_modules/doctr/documents/elements.html b/v0.6.0/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.6.0/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
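For reference, the removed doctr.documents hierarchy above could be assembled by hand as follows (toy relative coordinates):

from doctr.documents.elements import Block, Document, Line, Page, Word

w1 = Word("Hello", 0.99, ((0.10, 0.10), (0.25, 0.14)))
w2 = Word("world", 0.98, ((0.27, 0.10), (0.42, 0.14)))
line = Line([w1, w2])  # geometry resolved to the smallest enclosing box
page = Page(blocks=[Block(lines=[line])], page_idx=0, dimensions=(595, 842))
doc = Document(pages=[page])
assert doc.render() == "Hello world"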
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/documents/reader.html b/v0.6.0/_modules/doctr/documents/reader.html deleted file mode 100644 index cdcd814b6c..0000000000 --- a/v0.6.0/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
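Putting the helpers above together, a short sketch of a typical flow (paths are placeholders):

from doctr.documents import DocumentFile

pdf = DocumentFile.from_pdf("path/to/your/doc.pdf")
pages = pdf.as_images(output_size=(1024, 726))  # list of H x W x 3 ndarrays
words = pdf.get_words()                         # per-page list of (bounding box, value) tuples
artefacts = pdf.get_artefacts()                 # per-page list of image bounding boxes
print(len(pages), len(words[0]))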
-
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/io/elements.html b/v0.6.0/_modules/doctr/io/elements.html index 753a47455c..e049d6ce30 100644 --- a/v0.6.0/_modules/doctr/io/elements.html +++ b/v0.6.0/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -1008,7 +1008,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.6.0/_modules/doctr/io/html.html b/v0.6.0/_modules/doctr/io/html.html index 7ad5b97031..be73631500 100644 --- a/v0.6.0/_modules/doctr/io/html.html +++ b/v0.6.0/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -360,7 +360,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.6.0/_modules/doctr/io/image/base.html b/v0.6.0/_modules/doctr/io/image/base.html index 336b4bff0e..a50c95d595 100644 --- a/v0.6.0/_modules/doctr/io/image/base.html +++ b/v0.6.0/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -388,7 +388,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.6.0/_modules/doctr/io/image/tensorflow.html b/v0.6.0/_modules/doctr/io/image/tensorflow.html index f1846820a3..3b9e731756 100644 --- a/v0.6.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.6.0/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.6.0/_modules/doctr/io/pdf.html b/v0.6.0/_modules/doctr/io/pdf.html index e3abf6960b..e5b94811c3 100644 --- a/v0.6.0/_modules/doctr/io/pdf.html +++ b/v0.6.0/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -377,7 +377,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.6.0/_modules/doctr/io/reader.html b/v0.6.0/_modules/doctr/io/reader.html index c1ddc26edd..d36e5bb553 100644 --- a/v0.6.0/_modules/doctr/io/reader.html +++ b/v0.6.0/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.6.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 9f074805c1..61a010d548 100644 --- a/v0.6.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -531,7 +531,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.6.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6a63851276..7c448394ad 100644 --- a/v0.6.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -793,7 +793,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.6.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/resnet/tensorflow.html index 095d377f31..aed4343741 100644 --- a/v0.6.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.6.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.6.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.6.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/vgg/tensorflow.html index 01ae452624..788111ae87 100644 --- a/v0.6.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.6.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/vit/tensorflow.html index 1333cf6045..971ba5abe9 100644 --- a/v0.6.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -533,7 +533,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.6.0/_modules/doctr/models/classification/zoo.html b/v0.6.0/_modules/doctr/models/classification/zoo.html index f7796a7522..3eb2a3ec4e 100644 --- a/v0.6.0/_modules/doctr/models/classification/zoo.html +++ b/v0.6.0/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.models.classification.zoo

<
- + diff --git a/v0.6.0/_modules/doctr/models/detection/differentiable_binarization.html b/v0.6.0/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index 38e9b36ec2..0000000000 --- a/v0.6.0/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@ - doctr.models.detection.differentiable_binarization - docTR documentation

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: expansion ratio used to unclip (grow back) the shrunk polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: coordinates of the polygon to expand
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly cast to an ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
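As a quick numeric check of the expansion distance used above (poly.area * unclip_ratio / poly.length), consider a 100 x 20 px box with the default unclip_ratio of 1.5; the sketch only needs shapely, which polygon_to_box already relies on:

import numpy as np
from shapely.geometry import Polygon

points = np.array([[0, 0], [100, 0], [100, 20], [0, 20]])
poly = Polygon(points)
distance = poly.area * 1.5 / poly.length
print(distance)  # 2000 * 1.5 / 240 = 12.5 px outward offset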
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            a (N, 5) numpy array of boxes for the bitmap, each box being a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box (relative coordinates)
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove boxes that are too small
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channels to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):  # top-down pathway: merge each coarser map into the finer one
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
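For intuition, a small sketch of the shapes flowing through this FPN for a 1024 x 1024 input with the ResNet-50 layers listed in default_cfgs (strides of 4, 8, 16 and 32 are assumed):

fpn_channels = 128
strides = [4, 8, 16, 32]  # conv2 ... conv5 outputs of ResNet-50
maps = [(1024 // s, 1024 // s, fpn_channels) for s in strides]
print(maps)  # [(256, 256, 128), (128, 128, 128), (64, 64, 128), (32, 32, 128)]

# Each map is then convolved and upsampled by 2**idx back to the finest resolution
# before concatenation, yielding a (256, 256, 4 * fpn_channels) feature map
print((256, 256, 4 * fpn_channels))  # (256, 256, 512)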
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature map is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
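A standalone numpy check of the law-of-cosines formula implemented above, for a single point (5, 3) against the segment a=(0, 0), b=(10, 0), whose perpendicular distance should be 3:

import numpy as np

xs, ys = np.array([[5.0]]), np.array([[3.0]])
a, b = np.array([0.0, 0.0]), np.array([10.0, 0.0])

d1 = np.square(xs - a[0]) + np.square(ys - a[1])       # squared distance to a
d2 = np.square(xs - b[0]) + np.square(ys - b[1])       # squared distance to b
dab = np.square(a[0] - b[0]) + np.square(a[1] - b[1])  # squared segment length
cosin = (dab - d1 - d2) / (2 * np.sqrt(d1 * d2) + 1e-7)
square_sin = np.nan_to_num(1 - np.square(cosin))
dist = np.sqrt(d1 * d2 * square_sin / dab)
dist[cosin < 0] = np.sqrt(np.fmin(d1, d2))[cosin < 0]
print(dist)  # ~[[3.]]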
-
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon treshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon: array of coordinates defining the polygon boundary
-            canvas: threshold map to fill with polygons
-            mask: mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
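The offset distance used here follows the DB paper, D = A * (1 - r**2) / L with r = shrink_ratio = 0.4; a quick check for the same 100 x 20 px box as before:

from shapely.geometry import Polygon

poly = Polygon([(0, 0), (100, 0), (100, 20), (0, 20)])
shrink_ratio = 0.4
distance = poly.area * (1 - shrink_ratio ** 2) / poly.length
print(distance)  # 2000 * 0.84 / 240 = 7.0 px (positive pad here, negative shrink in compute_target)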
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
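The balanced BCE term above keeps every positive pixel but only the hardest negatives, capped at three times the positive count; a tiny numpy sketch of that selection with made-up values:

import numpy as np

seg_target = np.array([1., 1., 0., 0., 0., 0., 0., 0.])
bce_loss = np.array([0.2, 0.4, 0.9, 0.1, 0.05, 0.7, 0.3, 0.02])

positive_count = seg_target.sum()                                      # 2
negative_count = int(min((1 - seg_target).sum(), 3 * positive_count))  # 6
hard_negatives = np.sort(bce_loss * (1 - seg_target))[::-1][:negative_count]
balanced = (np.sum(bce_loss * seg_target) + hard_negatives.sum()) / (positive_count + negative_count + 1e-6)
print(balanced)  # ~0.33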
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.6.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 9145c7c3fd..66cef8663d 100644 --- a/v0.6.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import List, Tuple, Optional, Any, Dict
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
+
+from ...classification import mobilenet_v3_large
 from .base import DBPostProcessor, _DBNet
 
-__all__ = ['DBNet', 'db_resnet50']
+__all__ = ["DBNet", "db_resnet50", "db_mobilenet_v3_large"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
+    "db_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_resnet50-649fa22b.weights.h5&src=0",
+    },
+    "db_mobilenet_v3_large": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_mobilenet_v3_large-ee2e1dbe.weights.h5&src=0",
     },
 }
 
@@ -313,6 +348,7 @@ 

Source code for doctr.models.detection.differentiable_binarization.tensorflo <https://arxiv.org/pdf/1612.03144.pdf>`_. Args: + ---- channels: number of channel to output """ @@ -322,9 +358,9 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo ) -> None: super().__init__() self.channels = channels - self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest') - self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)] - self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)] + self.upsample = layers.UpSampling2D(size=(2, 2), interpolation="nearest") + self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer="he_normal") for _ in range(4)] + self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2**idx) for idx in range(4)] @staticmethod def build_upsampling( @@ -334,20 +370,21 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo """Module which performs a 3x3 convolution followed by up-sampling Args: + ---- channels: number of output channels dilation_factor (int): dilation factor to scale the convolution output before concatenation Returns: + ------- a keras.layers.Layer object, wrapping these operations in a sequential module """ - - _layers = conv_sequence(channels, 'relu', True, kernel_size=3) + _layers = conv_sequence(channels, "relu", True, kernel_size=3) if dilation_factor > 1: - _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest')) + _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation="nearest")) - module = keras.Sequential(_layers) + module = Sequential(_layers) return module @@ -359,7 +396,6 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo x: List[tf.Tensor], **kwargs: Any, ) -> tf.Tensor: - # Channel mapping results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)] # Upsample & sum @@ -371,200 +407,324 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo return layers.concatenate(results) -class DBNet(_DBNet, keras.Model, NestedObject): +class DBNet(_DBNet, Model, NestedObject): """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_. Args: + ---- feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to + bin_thresh: threshold for binarization + box_thresh: minimal objectness score to consider a box + assume_straight_pages: if True, fit straight bounding boxes only + exportable: onnx exportable returns only logits + cfg: the configuration dict of the model + class_names: list of class names """ - _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "fpn", "probability_head", "threshold_head", "postprocessor"] def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, - rotated_bbox: bool = False, + fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea + bin_thresh: float = 0.3, + box_thresh: float = 0.1, + assume_straight_pages: bool = True, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, + class_names: List[str] = [CLASS_NAME], ) -> None: - super().__init__() + self.class_names = class_names + num_classes: int = len(self.class_names) self.cfg = cfg self.feat_extractor = feature_extractor - self.rotated_bbox = rotated_bbox + self.exportable = exportable + self.assume_straight_pages = assume_straight_pages self.fpn = FeaturePyramidNetwork(channels=fpn_channels) # Initialize kernels _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape] output_shape = tuple(self.fpn(_inputs).shape) - self.probability_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] + self.probability_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + self.threshold_head = Sequential([ + *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + layers.BatchNormalization(), + layers.Activation("relu"), + layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + ]) + + self.postprocessor = DBPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh ) - self.threshold_head = keras.Sequential( - [ - *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), - layers.BatchNormalization(), - layers.Activation('relu'), - layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), - ] - ) - - self.postprocessor = 
DBPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, thresh_map: tf.Tensor, - target: List[Dict[str, Any]] + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output Args: + ---- out_map: output feature map of the model of shape (N, H, W, C) thresh_map: threshold map of shape (N, H, W, C) target: list of dictionary where each dict has a `boxes` and a `flags` entry + gamma: modulating factor in the focal loss formula + alpha: balancing factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") - prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1])) - thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) + prob_map = tf.math.sigmoid(out_map) + thresh_map = tf.math.sigmoid(thresh_map) - seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + seg_target, seg_mask, thresh_target, thresh_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) + seg_mask = tf.cast(seg_mask, tf.float32) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) - # Compute balanced BCE loss for proba_map - bce_scale = 5. - bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask] - - neg_target = 1 - seg_target[seg_mask] - positive_count = tf.math.reduce_sum(seg_target[seg_mask]) - negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count]) - negative_loss = bce_loss * neg_target - negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32)) - sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss) - balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6) - - # Compute dice loss for approxbin_map - bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask]))) - - bce_min = tf.math.reduce_min(bce_loss) - weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1. 
- inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights) - union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8 - dice_loss = 1 - 2.0 * inter / union + # Focal loss + focal_scale = 10.0 + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + + # Convert logits to prob, compute gamma factor + p_t = (seg_target * prob_map) + ((1 - seg_target) * (1 - prob_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class or for approx binary_map + if len(self.class_names) > 1: + dice_map = tf.nn.softmax(out_map, axis=-1) + else: + # compute binary map instead + dice_map = 1.0 / (1.0 + tf.exp(-50 * (prob_map - thresh_map))) + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) # Compute l1 loss for thresh_map - l1_scale = 10. if tf.reduce_any(thresh_mask): - l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask])) + thresh_mask = tf.cast(thresh_mask, tf.float32) + l1_loss = tf.reduce_sum(tf.abs(thresh_map - thresh_target) * thresh_mask) / ( + tf.reduce_sum(thresh_mask) + eps + ) else: - l1_loss = tf.constant(0.) + l1_loss = tf.constant(0.0) - return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss + return l1_loss + focal_scale * focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) feat_concat = self.fpn(feat_maps, **kwargs) logits = self.probability_head(feat_concat, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: - # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + if target is None or return_preds: + # Post-process boxes (keep only text predictions) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: thresh_map = self.threshold_head(feat_concat, **kwargs) loss = self.compute_loss(logits, thresh_map, target) - out['loss'] = loss + out["loss"] = loss return out -def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: +def _db_resnet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['fpn_channels'] = 
kwargs.get('fpn_channels', _cfg['fpn_channels']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) # Feature extractor - resnet = tf.keras.applications.__dict__[_cfg['backbone']]( - include_top=False, - weights=None, - input_shape=_cfg['input_shape'], - pooling=None, + feat_extractor = IntermediateLayerGetter( + backbone_fn( + weights="imagenet" if pretrained_backbone else None, + include_top=False, + pooling=None, + input_shape=_cfg["input_shape"], + ), + fpn_layers, ) + # Build the model + model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + + # Load pretrained parameters + if pretrained: + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) + + return model + + +def _db_mobilenet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> DBNet: + pretrained_backbone = pretrained_backbone and not pretrained + + # Patch the config + _cfg = deepcopy(default_cfgs[arch]) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = default_cfgs[arch].get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor feat_extractor = IntermediateLayerGetter( - resnet, - _cfg['fpn_layers'], + backbone_fn( + input_shape=_cfg["input_shape"], + include_top=False, + pretrained=pretrained_backbone, + ), + fpn_layers, ) - kwargs['fpn_channels'] = _cfg['fpn_channels'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] - # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model
-[docs] +[docs] def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture Returns: + ------- text detection architecture """ + return _db_resnet( + "db_resnet50", + pretrained, + ResNet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
+ + + +
+[docs] +def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: + """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" + <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. + + >>> import tensorflow as tf + >>> from doctr.models import db_mobilenet_v3_large + >>> model = db_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) - return _db_resnet('db_resnet50', pretrained, **kwargs)
+ Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the DBNet architecture + + Returns: + ------- + text detection architecture + """ + return _db_mobilenet( + "db_mobilenet_v3_large", + pretrained, + mobilenet_v3_large, + ["inverted_2", "inverted_5", "inverted_11", "final_block"], + **kwargs, + )

@@ -598,8 +758,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo - - + + diff --git a/v0.6.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.6.0/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.6.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.6.0/_modules/doctr/models/detection/linknet.html b/v0.6.0/_modules/doctr/models/detection/linknet.html deleted file mode 100644 index 129cfdce8b..0000000000 --- a/v0.6.0/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@ - doctr.models.detection.linknet - docTR documentation

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from differentiable linknet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            a (N, 5) numpy array of boxes for the bitmap, each box being a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box (relative coordinates)
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with fewer than 4 points
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
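A tiny self-contained sketch of the connected-component step above, on a synthetic 8 x 8 bitmap with a single blob (the blob location is made up):

import cv2
import numpy as np

bitmap = np.zeros((8, 8), dtype=np.uint8)
bitmap[2:5, 1:6] = 1  # one blob of "text" pixels
_, labels = cv2.connectedComponents(bitmap, connectivity=4)
points = np.array(np.where(labels == 1)[::-1]).T.astype(np.int32)
print(cv2.boundingRect(points))  # (1, 2, 5, 3) -> x, y, w, h in absolute pixels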
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
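A hedged sketch of what this block does to a feature map, assuming the `conv_sequence` helper imported above is available: the stride-2 transposed convolution doubles the spatial resolution while the 1x1 convolutions remap channels.

>>> import tensorflow as tf
>>> block = decoder_block(in_chan=128, out_chan=64)
>>> feat = tf.random.uniform(shape=[1, 32, 32, 128], maxval=1, dtype=tf.float32)
>>> block(feat).shape  # TensorShape([1, 64, 64, 64]): spatial size doubled, channels mapped to out_chan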
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
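A shape sketch of the encoder-decoder above, assuming each `ResnetStage` with `downsample=True` halves the spatial resolution (which the stride-2 transposed convolutions in the decoders then undo via the skip connections):

>>> import tensorflow as tf
>>> fpn = LinkNetFPN()
>>> x = tf.random.uniform(shape=[1, 128, 128, 64], maxval=1, dtype=tf.float32)  # e.g. the stem output for a 512x512 image
>>> fpn(x).shape  # TensorShape([1, 128, 128, 64]): same resolution and channels as the input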
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=bool)
-        seg_mask = np.ones(output_shape, dtype=bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
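A toy illustration (assumed values, assuming `LinkNet()` can be instantiated with its defaults) of the target construction above: one image, one unambiguous relative box, rendered on a 1x8x8 grid:

>>> import numpy as np
>>> import tensorflow as tf
>>> model = LinkNet()
>>> target = [{'boxes': np.array([[0.25, 0.25, 0.75, 0.75]]), 'flags': np.array([False])}]
>>> seg_target, seg_mask = model.compute_target(target, (1, 8, 8))
>>> int(tf.reduce_sum(seg_target))  # 25: the 5x5 patch of pixels covered by the box is set to 1
>>> bool(tf.reduce_all(seg_mask))   # True: nothing is masked out (no ambiguous or undersized box)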
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
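A self-contained toy of the masked BCE reduction above; with zero logits and all-ones targets the loss is -log(sigmoid(0)) ≈ 0.693:

>>> import tensorflow as tf
>>> logits = tf.zeros((1, 8, 8, 1))                # raw model output
>>> seg_target = tf.ones((1, 8, 8))                # every pixel labelled as text
>>> seg_mask = tf.ones((1, 8, 8), dtype=tf.bool)   # no pixel masked out
>>> loss = tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
...     seg_target[seg_mask], tf.squeeze(logits, axis=[-1])[seg_mask], from_logits=True))
>>> float(loss)  # ~0.6931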
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.6.0/_modules/doctr/models/detection/linknet/tensorflow.html index cd4f446673..ce995f99d4 100644 --- a/v0.6.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.linknet.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
 
 from copy import deepcopy
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
+from typing import Any, Dict, List, Optional, Tuple
 
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import Model, Sequential, layers, losses
+
+from doctr.file_utils import CLASS_NAME
+from doctr.models.classification import resnet18, resnet34, resnet50
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
-from doctr.models.backbones import ResnetStage
-from doctr.models.utils import conv_sequence, load_pretrained_params
-from .base import LinkNetPostProcessor, _LinkNet
 
-__all__ = ['LinkNet', 'linknet16']
+from .base import LinkNetPostProcessor, _LinkNet
 
+__all__ = ["LinkNet", "linknet_resnet18", "linknet_resnet34", "linknet_resnet50"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet16': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'num_classes': 1,
-        'input_shape': (1024, 1024, 3),
-        'rotated_bbox': False,
-        'url': None,
+    "linknet_resnet18": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet18-615a82c5.weights.h5&src=0",
+    },
+    "linknet_resnet34": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet34-9d772be5.weights.h5&src=0",
+    },
+    "linknet_resnet50": {
+        "mean": (0.798, 0.785, 0.772),
+        "std": (0.264, 0.2749, 0.287),
+        "input_shape": (1024, 1024, 3),
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/linknet_resnet50-6bf6c8b5.weights.h5&src=0",
     },
 }
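A hedged sketch of how the `mean`/`std` entries above are typically consumed: the detection pipeline normalizes input images with these statistics before they reach the model (the actual logic lives in `PreProcessor`, not in this file):

>>> import tensorflow as tf
>>> cfg = default_cfgs["linknet_resnet18"]
>>> img = tf.random.uniform(shape=[1, *cfg["input_shape"]], maxval=1, dtype=tf.float32)
>>> normalized = (img - tf.constant(cfg["mean"])) / tf.constant(cfg["std"])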
 
 
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
+def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential:
     """Creates a LinkNet decoder block"""
-
     return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
+        *conv_sequence(in_chan // 4, "relu", True, kernel_size=1, **kwargs),
         layers.Conv2DTranspose(
             filters=in_chan // 4,
             kernel_size=3,
-            strides=2,
+            strides=stride,
             padding="same",
             use_bias=False,
-            kernel_initializer='he_normal'
+            kernel_initializer="he_normal",
         ),
         layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
+        layers.Activation("relu"),
+        *conv_sequence(out_chan, "relu", True, kernel_size=1),
     ])
 
 
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module"""
+class LinkNetFPN(Model, NestedObject):
+    """LinkNet Decoder module"""
 
     def __init__(
         self,
+        out_chans: int,
+        in_shapes: List[Tuple[int, ...]],
     ) -> None:
-
         super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
+        self.out_chans = out_chans
+        strides = [2] * (len(in_shapes) - 1) + [1]
+        i_chans = [s[-1] for s in in_shapes[::-1]]
+        o_chans = i_chans[1:] + [out_chans]
+        self.decoders = [
+            decoder_block(in_chan, out_chan, s, input_shape=in_shape)
+            for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1])
+        ]
+
+    def call(self, x: List[tf.Tensor], **kwargs: Any) -> tf.Tensor:
+        out = 0
+        for decoder, fmap in zip(self.decoders, x[::-1]):
+            out = decoder(out + fmap, **kwargs)
+        return out
 
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(_LinkNet, keras.Model):
+    def extra_repr(self) -> str:
+        return f"out_chans={self.out_chans}"
+
+
+class LinkNet(_LinkNet, Model):
     """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
     <https://arxiv.org/pdf/1707.03718.pdf>`_.
 
     Args:
-        num_classes: number of channels for the output
+    ----
+        feature extractor: the backbone serving as feature extractor
+        fpn_channels: number of channels each extracted feature maps is mapped to
+        bin_thresh: threshold for binarization of the output feature map
+        box_thresh: minimal objectness score to consider a box
+        assume_straight_pages: if True, fit straight bounding boxes only
+        exportable: onnx exportable returns only logits
+        cfg: the configuration dict of the model
+        class_names: list of class names
     """
 
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
+    _children_names: List[str] = ["feat_extractor", "fpn", "classifier", "postprocessor"]
 
     def __init__(
         self,
-        num_classes: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        rotated_bbox: bool = False,
+        feat_extractor: IntermediateLayerGetter,
+        fpn_channels: int = 64,
+        bin_thresh: float = 0.1,
+        box_thresh: float = 0.1,
+        assume_straight_pages: bool = True,
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
+        class_names: List[str] = [CLASS_NAME],
     ) -> None:
         super().__init__(cfg=cfg)
 
-        self.rotated_bbox = rotated_bbox
+        self.class_names = class_names
+        num_classes: int = len(self.class_names)
 
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
+        self.exportable = exportable
+        self.assume_straight_pages = assume_straight_pages
+
+        self.feat_extractor = feat_extractor
 
-        self.fpn = LinkNetFPN()
+        self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape])
+        self.fpn.build(self.feat_extractor.output_shape)
 
         self.classifier = Sequential([
             layers.Conv2DTranspose(
@@ -393,154 +442,246 @@ 

Source code for doctr.models.detection.linknet.tensorflow

strides=2, padding="same", use_bias=False, - kernel_initializer='he_normal' + kernel_initializer="he_normal", + input_shape=self.fpn.decoders[-1].output_shape[1:], ), layers.BatchNormalization(), - layers.Activation('relu'), - *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), + layers.Activation("relu"), + *conv_sequence(32, "relu", True, kernel_size=3, strides=1), layers.Conv2DTranspose( filters=num_classes, kernel_size=2, strides=2, padding="same", - use_bias=False, - kernel_initializer='he_normal' + use_bias=True, + kernel_initializer="he_normal", ), ]) - self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) + self.postprocessor = LinkNetPostProcessor( + assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh + ) def compute_loss( self, out_map: tf.Tensor, - target: List[Dict[str, Any]], - focal_loss: bool = False, - alpha: float = .5, - gamma: float = 2., - edge_factor: float = 2., + target: List[Dict[str, np.ndarray]], + gamma: float = 2.0, + alpha: float = 0.5, + eps: float = 1e-8, ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on <https://github.com/tensorflow/addons/>`_. Args: + ---- out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry - focal_loss: if True, use focal loss instead of BCE - edge_factor: boost factor for box edges (in case of BCE) + gamma: modulating factor in the focal loss formula alpha: balancing factor in the focal loss formula - gammma: modulating factor in the focal loss formula + eps: epsilon factor in dice loss Returns: + ------- A loss tensor """ - seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) - edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) + seg_target, seg_mask = self.build_target(target, out_map.shape[1:], True) + seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - - # Get the cross_entropy for each entry - bce = tf.keras.losses.binary_crossentropy( - seg_target[seg_mask], - tf.squeeze(out_map, axis=[-1])[seg_mask], - from_logits=True) - - if focal_loss: - if gamma and gamma < 0: - raise ValueError("Value of gamma should be greater than or equal to zero.") - - # Convert logits to prob, compute gamma factor - pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) - p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) - modulating_factor = tf.pow((1.0 - p_t), gamma) - - # Compute alpha factor - alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) - - # compute the final loss - loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) - - else: - # Compute BCE loss with highlighted edges - loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), - bce - ) - loss = tf.reduce_mean(loss) - - return loss + seg_mask = tf.cast(seg_mask, tf.float32) + + bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True) + proba_map = tf.sigmoid(out_map) + + # Focal loss + if gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + # Convert logits to prob, compute gamma factor + p_t = (seg_target * proba_map) + ((1 - seg_target) * (1 - proba_map)) + alpha_t = seg_target * alpha + (1 - seg_target) * 
(1 - alpha) + # Unreduced loss + focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss + # Class reduced + focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2, 3)) / tf.reduce_sum(seg_mask, (0, 1, 2, 3)) + + # Compute dice loss for each class + dice_map = tf.nn.softmax(out_map, axis=-1) if len(self.class_names) > 1 else proba_map + # Class-reduced dice loss + inter = tf.reduce_sum(seg_mask * dice_map * seg_target, axis=[0, 1, 2]) + cardinality = tf.reduce_sum(seg_mask * (dice_map + seg_target), axis=[0, 1, 2]) + dice_loss = tf.reduce_mean(1 - 2 * inter / (cardinality + eps)) + + return focal_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[Dict[str, Any]]] = None, + target: Optional[List[Dict[str, np.ndarray]]] = None, return_model_output: bool = False, - return_boxes: bool = False, - focal_loss: bool = True, + return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - - logits = self.stem(x) - logits = self.fpn(logits) - logits = self.classifier(logits) + feat_maps = self.feat_extractor(x, **kwargs) + logits = self.fpn(feat_maps, **kwargs) + logits = self.classifier(logits, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_boxes: - prob_map = tf.math.sigmoid(logits) + if self.exportable: + out["logits"] = logits + return out + + if return_model_output or target is None or return_preds: + prob_map = _bf16_to_float32(tf.math.sigmoid(logits)) + if return_model_output: out["out_map"] = prob_map - if target is None or return_boxes: + if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) + out["preds"] = [dict(zip(self.class_names, preds)) for preds in self.postprocessor(prob_map.numpy())] if target is not None: - loss = self.compute_loss(logits, target, focal_loss) - out['loss'] = loss + loss = self.compute_loss(logits, target) + out["loss"] = loss return out -def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: +def _linknet( + arch: str, + pretrained: bool, + backbone_fn, + fpn_layers: List[str], + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> LinkNet: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) - _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] + if not kwargs.get("class_names", None): + kwargs["class_names"] = _cfg.get("class_names", [CLASS_NAME]) + else: + kwargs["class_names"] = sorted(kwargs["class_names"]) + + # Feature extractor + feat_extractor = IntermediateLayerGetter( + backbone_fn( + pretrained=pretrained_backbone, + include_top=False, + input_shape=_cfg["input_shape"], + ), + fpn_layers, + ) - kwargs['num_classes'] = _cfg['num_classes'] - kwargs['input_shape'] = _cfg['input_shape'] - kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(cfg=_cfg, **kwargs) + model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, + _cfg["url"], + 
skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]), + ) return model -
-[docs] -def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
+[docs] +def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet18 + >>> model = linknet_resnet18(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture + + Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet18", + pretrained, + resnet18, + ["resnet_block_1", "resnet_block_3", "resnet_block_5", "resnet_block_7"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet34(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet16 - >>> model = linknet16(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet34 + >>> model = linknet_resnet34(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture Returns: + ------- text detection architecture """ + return _linknet( + "linknet_resnet34", + pretrained, + resnet34, + ["resnet_block_2", "resnet_block_6", "resnet_block_12", "resnet_block_15"], + **kwargs, + )
+ + + +
+[docs] +def linknet_resnet50(pretrained: bool = False, **kwargs: Any) -> LinkNet: + """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" + <https://arxiv.org/pdf/1707.03718.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import linknet_resnet50 + >>> model = linknet_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text detection dataset + **kwargs: keyword arguments of the LinkNet architecture - return _linknet('linknet16', pretrained, **kwargs)
+ Returns: + ------- + text detection architecture + """ + return _linknet( + "linknet_resnet50", + pretrained, + resnet50, + ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + **kwargs, + )
@@ -574,8 +715,8 @@

Source code for doctr.models.detection.linknet.tensorflow

- +
+ diff --git a/v0.6.0/_modules/doctr/models/detection/zoo.html b/v0.6.0/_modules/doctr/models/detection/zoo.html index d3128b8d14..3651c4e2d3 100644 --- a/v0.6.0/_modules/doctr/models/detection/zoo.html +++ b/v0.6.0/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.detection.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
 from doctr.file_utils import is_tf_available, is_torch_available
-from .core import DetectionPredictor
-from ..preprocessor import PreProcessor
-from .. import detection
 
+from .. import detection
+from ..detection.fast import reparameterize
+from ..preprocessor import PreProcessor
+from .predictor import DetectionPredictor
 
 __all__ = ["detection_predictor"]
 
+ARCHS: List[str]
+
 
 if is_tf_available():
-    ARCHS = ['db_resnet50', 'linknet16']
+    ARCHS = [
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
 elif is_torch_available():
-    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
+    ARCHS = [
+        "db_resnet34",
+        "db_resnet50",
+        "db_mobilenet_v3_large",
+        "linknet_resnet18",
+        "linknet_resnet34",
+        "linknet_resnet50",
+        "fast_tiny",
+        "fast_small",
+        "fast_base",
+    ]
+
 
+def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
+        _model = detection.__dict__[arch](
+            pretrained=pretrained,
+            pretrained_backbone=kwargs.get("pretrained_backbone", True),
+            assume_straight_pages=assume_straight_pages,
+        )
+        # Reparameterize FAST models by default to lower inference latency and memory usage
+        if isinstance(_model, detection.FAST):
+            _model = reparameterize(_model)
+    else:
+        if not isinstance(arch, (detection.DBNet, detection.LinkNet, detection.FAST)):
+            raise ValueError(f"unknown architecture: {type(arch)}")
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+        _model = arch
+        _model.assume_straight_pages = assume_straight_pages
+        _model.postprocessor.assume_straight_pages = assume_straight_pages
 
-    # Detection
-    _model = detection.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 1)
+    kwargs.pop("pretrained_backbone", None)
+
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 2)
     predictor = DetectionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
-        _model
+        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
+        _model,
     )
     return predictor
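A hedged usage sketch of the patched factory: `arch` may now be an architecture name from `ARCHS` or an already-built model instance (for example one fine-tuned on custom data); the names below are taken from the lists above:

>>> from doctr.models import detection_predictor, linknet_resnet18
>>> predictor = detection_predictor("linknet_resnet18", pretrained=True)       # by architecture name
>>> custom_model = linknet_resnet18(pretrained=False)
>>> predictor = detection_predictor(custom_model, assume_straight_pages=True)  # or with a model object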
 
 
 
-[docs] -def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: +[docs] +def detection_predictor( + arch: Any = "fast_base", + pretrained: bool = False, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + batch_size: int = 2, + **kwargs: Any, +) -> DetectionPredictor: """Text detection architecture. - Example:: - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_resnet50') + ---- + arch: name of the architecture or model itself to use (e.g. 'db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset + assume_straight_pages: If True, fit straight boxes to the page + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right + batch_size: number of samples the model processes in parallel + **kwargs: optional keyword arguments passed to the architecture Returns: + ------- Detection predictor """ - - return _predictor(arch, pretrained, **kwargs)
+ return _predictor( + arch=arch, + pretrained=pretrained, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + batch_size=batch_size, + **kwargs, + )
@@ -367,8 +449,8 @@

Source code for doctr.models.detection.zoo

       
     
   
- - + + diff --git a/v0.6.0/_modules/doctr/models/export.html b/v0.6.0/_modules/doctr/models/export.html deleted file mode 100644 index f25a81aa21..0000000000 --- a/v0.6.0/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@ - - - - - - - - - - - - doctr.models.export - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- -
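The three helpers above all return the serialized model as bytes; a minimal sketch of persisting the result to disk (the file name is an arbitrary assumption):

>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_tflite, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized = convert_to_tflite(model)
>>> with open("model.tflite", "wb") as f:
...     _ = f.write(serialized)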
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/models/factory/hub.html b/v0.6.0/_modules/doctr/models/factory/hub.html index 8274a809f5..756b2c7a17 100644 --- a/v0.6.0/_modules/doctr/models/factory/hub.html +++ b/v0.6.0/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -568,7 +568,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.6.0/_modules/doctr/models/recognition/crnn.html b/v0.6.0/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.6.0/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.crnn - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs CTC decoding of the raw model output, then maps the decoded predictions
-        to characters with the label_to_idx mapping dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
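A toy sketch (not library code) of the CTC decoding call wrapped above, with a 2-character vocabulary and the blank as the last class; the decoder collapses repeats and drops blanks:

>>> import tensorflow as tf
>>> # logits favouring "a", then blank, then "b" over 3 time steps (classes: 0='a', 1='b', 2=blank)
>>> logits = tf.constant([[[5., 0., 0.], [0., 0., 5.], [0., 5., 0.]]])
>>> decoded, log_prob = tf.nn.ctc_beam_search_decoder(
...     tf.transpose(logits, perm=[1, 0, 2]),  # the op expects time-major input
...     tf.constant([3]), beam_width=1, top_paths=1)
>>> tf.sparse.to_dense(decoded[0]).numpy()  # array([[0, 1]]): character indices for "ab"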
-
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            model_output: predicted logits of the model
-            target: list of ground-truth words for the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
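A self-contained toy of the underlying `tf.nn.ctc_loss` call (toy sizes; the blank index is the last class, as above):

>>> import tensorflow as tf
>>> batch, seq_len, vocab_size = 2, 8, 3
>>> logits = tf.random.uniform([batch, seq_len, vocab_size + 1])
>>> gt = tf.constant([[0, 1, 2], [1, 1, 0]], dtype=tf.int32)        # encoded ground-truth labels
>>> label_len = tf.constant([3, 3], dtype=tf.int32)
>>> logit_len = tf.fill([batch], seq_len)
>>> loss = tf.nn.ctc_loss(gt, logits, label_len, logit_len,
...                       logits_time_major=False, blank_index=vocab_size)
>>> loss.shape  # TensorShape([2]): one loss value per sample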
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/crnn/tensorflow.html index 41cc93dd23..bc64da9a1b 100644 --- a/v0.6.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.crnn.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import tensorflow as tf
 from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential, Model
-from typing import Tuple, Dict, Any, Optional, List
+from tensorflow.keras.models import Model, Sequential
+
+from doctr.datasets import VOCABS
 
-from ... import backbones
-from ...utils import load_pretrained_params
+from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
+__all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
+    "crnn_vgg16_bn": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["legacy_french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_vgg16_bn-9c188f45.weights.h5&src=0",
     },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
+    "crnn_mobilenet_v3_small": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_small-54850265.weights.h5&src=0",
+    },
+    "crnn_mobilenet_v3_large": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_large-c64045e5.weights.h5&src=0",
     },
 }
 
 
 class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
+    """Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
         ignore_case: if True, ignore case of letters
         ignore_accents: if True, ignore accents of letters
@@ -325,37 +353,57 @@ 

Source code for doctr.models.recognition.crnn.tensorflow

def __call__( self, - logits: tf.Tensor - ) -> List[Tuple[str, float]]: - """ - Performs decoding of raw output with CTC and decoding of CTC predictions + logits: tf.Tensor, + beam_width: int = 1, + top_paths: int = 1, + ) -> Union[List[Tuple[str, float]], List[Tuple[List[str], List[float]]]]: + """Performs decoding of raw output with CTC and decoding of CTC predictions with label_to_idx mapping dictionnary Args: + ---- logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1 + beam_width: An int scalar >= 0 (beam search beam width). + top_paths: An int scalar >= 0, <= beam_width (controls output size). Returns: + ------- A list of decoded words of length BATCH_SIZE + """ # Decode CTC _decoded, _log_prob = tf.nn.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), - tf.fill(logits.shape[0], logits.shape[1]), - beam_width=1, top_paths=1, + tf.fill(tf.shape(logits)[:1], tf.shape(logits)[1]), + beam_width=beam_width, + top_paths=top_paths, ) - out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab)) - probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) + + _decoded = tf.sparse.concat( + 1, + [tf.sparse.expand_dims(dec, axis=1) for dec in _decoded], + expand_nonconcat_dims=True, + ) # dim : batchsize x beamwidth x actual_max_len_predictions + out_idxs = tf.sparse.to_dense(_decoded, default_value=len(self.vocab)) # Map it to characters _decoded_strings_pred = tf.strings.reduce_join( inputs=tf.nn.embedding_lookup(tf.constant(self._embedding, dtype=tf.string), out_idxs), - axis=-1 + axis=-1, ) _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] - word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - + decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value="not valid")[ + :, :, 0 + ] # dim : batch_size x beam_width + + if top_paths == 1: + probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) # dim : batchsize + decoded_strings_pred = tf.squeeze(decoded_strings_pred, axis=1) + word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] + else: + probs = tf.math.exp(_log_prob) # dim : batchsize x beamwidth + word_values = [[word.decode() for word in words] for words in decoded_strings_pred.numpy().tolist()] return list(zip(word_values, probs.numpy().tolist())) @@ -364,19 +412,26 @@

Source code for doctr.models.recognition.crnn.tensorflow

Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of units in the LSTM layers + exportable: onnx exportable returns only logits + beam_width: beam width for beam search decoding + top_paths: number of top paths for beam search decoding cfg: configuration dictionary """ - _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "decoder", "postprocessor"] def __init__( self, - feature_extractor: tf.keras.Model, + feature_extractor: Model, vocab: str, rnn_units: int = 128, + exportable: bool = False, + beam_width: int = 1, + top_paths: int = 1, cfg: Optional[Dict[str, Any]] = None, ) -> None: # Initialize kernels @@ -386,19 +441,21 @@

Source code for doctr.models.recognition.crnn.tensorflow

self.vocab = vocab self.max_length = w self.cfg = cfg + self.exportable = exportable self.feat_extractor = feature_extractor - self.decoder = Sequential( - [ - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Dense(units=len(vocab) + 1) - ] - ) + self.decoder = Sequential([ + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), + layers.Dense(units=len(vocab) + 1), + ]) self.decoder.build(input_shape=(None, w, h * c)) self.postprocessor = CTCPostProcessor(vocab=vocab) + self.beam_width = beam_width + self.top_paths = top_paths + def compute_loss( self, model_output: tf.Tensor, @@ -407,16 +464,17 @@

Source code for doctr.models.recognition.crnn.tensorflow

"""Compute CTC loss for the model. Args: - gt: the encoded tensor with gt labels + ---- model_output: predicted logits of the model - seq_len: lengths of each gt word inside the batch + target: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) batch_len = model_output.shape[0] - input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) + input_length = tf.fill((batch_len,), model_output.shape[1]) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -428,8 +486,12 @@

Source code for doctr.models.recognition.crnn.tensorflow

target: Optional[List[str]] = None, return_model_output: bool = False, return_preds: bool = False, + beam_width: int = 1, + top_paths: int = 1, **kwargs: Any, ) -> Dict[str, Any]: + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") features = self.feat_extractor(x, **kwargs) # B x H x W x C --> B x W x H x C @@ -437,91 +499,132 @@

Source code for doctr.models.recognition.crnn.tensorflow

w, h, c = transposed_feat.get_shape().as_list()[1:] # B x W x H x C --> B x W x H * C features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c)) - logits = self.decoder(features_seq, **kwargs) + logits = _bf16_to_float32(self.decoder(features_seq, **kwargs)) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = logits + return out + if return_model_output: out["out_map"] = logits if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(logits) + out["preds"] = self.postprocessor(logits, beam_width=beam_width, top_paths=top_paths) if target is not None: - out['loss'] = self.compute_loss(logits, target) + out["loss"] = self.compute_loss(logits, target) return out -def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: +def _crnn( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> CRNN: + pretrained_backbone = pretrained_backbone and not pretrained + + kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"]) - # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) + _cfg["vocab"] = kwargs["vocab"] + _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] - # Feature extractor - feat_extractor = backbones.__dict__[_cfg['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + input_shape=_cfg["input_shape"], include_top=False, + pretrained=pretrained_backbone, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params(model, _cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]) return model
-[docs] +[docs] def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_vgg16_bn + >>> model = crnn_vgg16_bn(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_vgg16_bn", pretrained, vgg16_bn_r, **kwargs)
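Because the call signature now exposes beam_width and top_paths, CTC decoding can be switched from greedy to beam search at inference time. An illustrative sketch (parameter values are arbitrary; the structure of "preds" with top_paths > 1 follows the post-processor shown above):
>>> import tensorflow as tf
>>> from doctr.models import crnn_vgg16_bn
>>> model = crnn_vgg16_bn(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor, return_preds=True, beam_width=10, top_paths=3)
>>> out["preds"]  # beam-search decoded candidates with confidence scores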
+ + + +
+[docs] +def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based + Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_small + >>> model = crnn_mobilenet_v3_small(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
+ Returns: + ------- + text recognition architecture + """ + return _crnn("crnn_mobilenet_v3_small", pretrained, mobilenet_v3_small_r, **kwargs)
-def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based +
+[docs] +def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import crnn_mobilenet_v3_large + >>> model = crnn_mobilenet_v3_large(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the CRNN architecture Returns: + ------- text recognition architecture """ + return _crnn("crnn_mobilenet_v3_large", pretrained, mobilenet_v3_large_r, **kwargs)
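The constructor change at the top of this file also introduces an exportable flag: when it is set, the forward pass returns only the raw logits, which is what an ONNX export graph needs. A short sketch, assuming the flag is simply forwarded through the factory's keyword arguments:
>>> import tensorflow as tf
>>> from doctr.models import crnn_mobilenet_v3_small
>>> model = crnn_mobilenet_v3_small(pretrained=True, exportable=True)
>>> out = model(tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32))
>>> list(out.keys())  # only the logits are returned in exportable mode
['logits']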
- return _crnn('crnn_resnet31', pretrained, **kwargs)
diff --git a/v0.6.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/master/tensorflow.html
index 2dc5a27717..aa6aa69325 100644
--- a/v0.6.0/_modules/doctr/models/recognition/master/tensorflow.html
+++ b/v0.6.0/_modules/doctr/models/recognition/master/tensorflow.html
@@ -225,20 +225,42 @@


-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import tensorflow as tf
-from tensorflow.keras import layers, Sequential, Model
-from typing import Tuple, List, Dict, Any, Optional
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
 
-from ..core import RecognitionPostProcessor
-from ...backbones.resnet import ResnetStage
-from ...utils import conv_sequence, load_pretrained_params
-from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
-from ....datasets import VOCABS
-from .base import _MASTER, _MASTERPostProcessor
+import tensorflow as tf
+from tensorflow.keras import Model, layers
+
+from doctr.datasets import VOCABS
+from doctr.models.classification import magc_resnet31
+from doctr.models.modules.transformer import Decoder, PositionalEncoding
 
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from .base import _MASTER, _MASTERPostProcessor
 
-__all__ = ['MASTER', 'master', 'MASTERPostProcessor']
+__all__ = ["MASTER", "master"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'master': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'input_shape': (48, 160, 3),
-        'vocab': VOCABS['french'],
-        'url': None,
+    "master": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
     },
 }
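The new configuration replaces the placeholder statistics with dataset mean/std values and a 32x128 input size. As an illustrative sketch (not the library's preprocessing code), such statistics are typically applied channel-wise after scaling pixel values to [0, 1]:
>>> import tensorflow as tf
>>> mean = tf.constant((0.694, 0.695, 0.693))
>>> std = tf.constant((0.299, 0.296, 0.301))
>>> img = tf.random.uniform((32, 128, 3), maxval=255.0)
>>> normalized = (img / 255.0 - mean) / std  # broadcasts over the channel axis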
 
 
-class MAGC(layers.Layer):
-
-    """Implements the Multi-Aspect Global Context Attention, as described in
-    <https://arxiv.org/pdf/1910.02562.pdf>`_.
-
-    Args:
-        inplanes: input channels
-        headers: number of headers to split channels
-        att_scale: if True, re-scale attention to counteract the variance distibutions
-        **kwargs
-    """
-
-    def __init__(
-        self,
-        inplanes: int,
-        headers: int = 1,
-        att_scale: bool = False,
-        **kwargs
-    ) -> None:
-        super().__init__(**kwargs)
-
-        self.headers = headers  # h
-        self.inplanes = inplanes  # C
-        self.att_scale = att_scale
-
-        self.single_header_inplanes = int(inplanes / headers)  # C / h
-
-        self.conv_mask = tf.keras.layers.Conv2D(
-            filters=1,
-            kernel_size=1,
-            kernel_initializer=tf.initializers.he_normal()
-        )
-
-        self.transform = tf.keras.Sequential(
-            [
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-                tf.keras.layers.LayerNormalization([1, 2, 3]),
-                tf.keras.layers.ReLU(),
-                tf.keras.layers.Conv2D(
-                    filters=self.inplanes,
-                    kernel_size=1,
-                    kernel_initializer=tf.initializers.he_normal()
-                ),
-            ],
-            name='transform'
-        )
-
-    @tf.function
-    def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
-        b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
-
-        # B, H, W, C -->> B*h, H, W, C/h
-        x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes))
-        x = tf.transpose(x, perm=(0, 3, 1, 2, 4))
-        x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes))
-
-        # Compute shorcut
-        shortcut = x
-        # B*h, 1, H*W, C/h
-        shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes))
-        # B*h, 1, C/h, H*W
-        shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2])
-
-        # Compute context mask
-        # B*h, H, W, 1,
-        context_mask = self.conv_mask(x)
-        # B*h, 1, H*W, 1
-        context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
-        # scale variance
-        if self.att_scale and self.headers > 1:
-            context_mask = context_mask / tf.sqrt(self.single_header_inplanes)
-        # B*h, 1, H*W, 1
-        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
-
-        # Compute context
-        # B*h, 1, C/h, 1
-        context = tf.matmul(shortcut, context_mask)
-        context = tf.reshape(context, shape=(b, 1, c, 1))
-        # B, 1, 1, C
-        context = tf.transpose(context, perm=(0, 1, 3, 2))
-        # Set shape to resolve shape when calling this module in the Sequential MAGCResnet
-        batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1]
-        context.set_shape([batch, 1, 1, chan])
-        return context
-
-    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
-        # Context modeling: B, H, W, C  ->  B, 1, 1, C
-        context = self.context_modeling(inputs)
-        # Transform: B, 1, 1, C  ->  B, 1, 1, C
-        transformed = self.transform(context)
-        return inputs + transformed
-
-
-class MAGCResnet(Sequential):
-
-    """Implements the modified resnet with MAGC layers, as described in paper.
-
-    Args:
-        headers: number of header to split channels in MAGC layers
-        input_shape: shape of the model input (without batch dim)
-    """
-
-    def __init__(
-        self,
-        headers: int = 1,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
-    ) -> None:
-        _layers = [
-            # conv_1x
-            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
-            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_2x
-            ResnetStage(num_blocks=1, output_channels=256),
-            MAGC(inplanes=256, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 2), (2, 2)),
-            # conv_3x
-            ResnetStage(num_blocks=2, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            layers.MaxPooling2D((2, 1), (2, 1)),
-            # conv_4x
-            ResnetStage(num_blocks=5, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-            # conv_5x
-            ResnetStage(num_blocks=3, output_channels=512),
-            MAGC(inplanes=512, headers=headers, att_scale=True),
-            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
-        ]
-        super().__init__(_layers)
-
-
 class MASTER(_MASTER, Model):
-
     """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
     Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
 
     Args:
+    ----
+        feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary, (without EOS, SOS, PAD)
         d_model: d parameter for the transformer decoder
-        headers: headers for the MAGC module
         dff: depth of the pointwise feed-forward layer
        num_heads: number of heads for the multi-head attention module
         num_layers: number of decoder layers to stack
         max_length: maximum length of character sequence handled by the model
-        input_size: size of the image inputs
+        dropout: dropout probability of the decoder
+        input_shape: size of the image inputs
+        exportable: onnx exportable returns only logits
+        cfg: dictionary containing information about the model
     """
 
     def __init__(
         self,
+        feature_extractor: Model,
         vocab: str,
         d_model: int = 512,
-        headers: int = 1,
         dff: int = 2048,
-        num_heads: int = 8,
+        num_heads: int = 8,  # number of heads in the transformer decoder
         num_layers: int = 3,
         max_length: int = 50,
-        input_shape: Tuple[int, int, int] = (48, 160, 3),
+        dropout: float = 0.2,
+        input_shape: Tuple[int, int, int] = (32, 128, 3),  # different from the paper
+        exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
 
-        self.vocab = vocab
+        self.exportable = exportable
         self.max_length = max_length
+        self.d_model = d_model
+        self.vocab = vocab
         self.cfg = cfg
         self.vocab_size = len(vocab)
 
-        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
-        self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS
+        self.feat_extractor = feature_extractor
+        self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])
 
         self.decoder = Decoder(
             num_layers=num_layers,
-            d_model=d_model,
+            d_model=self.d_model,
             num_heads=num_heads,
+            vocab_size=self.vocab_size + 3,  # EOS, SOS, PAD
             dff=dff,
-            vocab_size=self.vocab_size,
-            maximum_position_encoding=max_length,
+            dropout=dropout,
+            maximum_position_encoding=self.max_length,
         )
-        self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
-        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
 
+        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
 
     @tf.function
-    def make_mask(self, target: tf.Tensor) -> tf.Tensor:
-        look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
-        target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
-        combined_mask = tf.maximum(target_padding_mask, look_ahead_mask)
-        return combined_mask
+    def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
+        # (N, 1, 1, max_length)
+        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
+        target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
+        target_length = target.shape[1]
+        # sub mask filled diagonal with 1 = see 0 = masked (max_length, max_length)
+        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
+        # source mask filled with ones (max_length, positional_encoded_seq_len)
+        source_mask = tf.ones((target_length, source.shape[1]))
+        # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
+        target_mask = tf.math.logical_and(
+            tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
+        )
+        return source_mask, target_mask
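As a standalone illustration (toy values, with 7 standing in for the PAD index), the boolean target mask built above combines a lower-triangular look-ahead mask with a padding mask, where False means masked:
>>> import tensorflow as tf
>>> seq = tf.constant([[5, 3, 9, 7, 7]])                                # (N=1, T=5), 7 = PAD
>>> pad_mask = tf.math.not_equal(seq, 7)[:, tf.newaxis, tf.newaxis, :]  # (1, 1, 1, 5)
>>> causal_mask = tf.cast(tf.linalg.band_part(tf.ones((5, 5)), -1, 0), tf.bool)
>>> target_mask = tf.math.logical_and(causal_mask, pad_mask)            # (1, 1, 5, 5)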
 
+    @staticmethod
     def compute_loss(
-        self,
         model_output: tf.Tensor,
         gt: tf.Tensor,
         seq_len: List[int],
@@ -512,11 +413,13 @@ 


Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -532,7 +435,7 @@


mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) @@ -547,94 +450,103 @@


"""Call function for training Args: + ---- x: images target: list of str labels return_model_output: if True, return logits return_preds: if True, decode logits + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A dictionnary containing eventually loss, logits and predictions. """ - # Encode - feature = self.feature_extractor(x, **kwargs) - b, h, w, c = (tf.shape(feature)[i] for i in range(4)) + feature = self.feat_extractor(x, **kwargs) + b, h, w, c = feature.get_shape() + # (N, H, W, C) --> (N, H * W, C) feature = tf.reshape(feature, shape=(b, h * w, c)) - encoded = feature + self.feature_pe[:, :h * w, :] + # add positional encoding to features + encoded = self.positional_encoding(feature, **kwargs) out: Dict[str, tf.Tensor] = {} + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training") + if target is not None: # Compute target: tensor of gts and sequence lengths - gt, seq_len = self.compute_target(target) - - if kwargs.get('training', False): - if target is None: - raise AssertionError("In training mode, you need to pass a value to 'target'") - tgt_mask = self.make_mask(gt) + gt, seq_len = self.build_target(target) + # Compute decoder masks + source_mask, target_mask = self.make_source_and_target_mask(encoded, gt) # Compute logits - output = self.decoder(gt, encoded, tgt_mask, None, **kwargs) + output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) - else: - # When not training, we want to compute logits in with the decoder, although - # we have access to gts (we need gts to compute the loss, but not in the decoder) logits = self.decode(encoded, **kwargs) + logits = _bf16_to_float32(logits) + + if self.exportable: + out["logits"] = logits + return out + if target is not None: - out['loss'] = self.compute_loss(logits, gt, seq_len) + out["loss"] = self.compute_loss(logits, gt, seq_len) if return_model_output: - out['out_map'] = logits + out["out_map"] = logits if return_preds: - predictions = self.postprocessor(logits) - out['preds'] = predictions + out["preds"] = self.postprocessor(logits) return out + @tf.function def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor: """Decode function for prediction Args: + ---- encoded: encoded features + **kwargs: keyword arguments passed to the decoder - Return: + Returns: + ------- A Tuple of tf.Tensor: predictions, logits """ - b = tf.shape(encoded)[0] - max_len = tf.constant(self.max_length, dtype=tf.int32) + b = encoded.shape[0] + start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32) # SOS padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32) # PAD - ys = tf.fill(dims=(b, max_len - 1), value=padding_symbol) + ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol) start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols - # max_len = len + 2 (sos + eos) + # Final dimension include EOS/SOS/PAD for i in range(self.max_length - 1): - ys_mask = self.make_mask(ys) - output = self.decoder(ys, encoded, ys_mask, None, **kwargs) + source_mask, target_mask = self.make_source_and_target_mask(encoded, ys) + output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs) logits = self.linear(output, **kwargs) prob = tf.nn.softmax(logits, axis=-1) - next_word = tf.argmax(prob, axis=-1, output_type=ys.dtype) - # ys.shape = B, T - 
i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(max_len), indexing='ij') + next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype) + # update ys with the next token and ignore the first token (SOS) + i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij") indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1) - ys = tf.tensor_scatter_nd_update(ys, indices, next_word[:, i + 1]) + ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i]) - # final_logits of shape (N, max_length - 1, vocab_size + 1) (whithout sos) + # Shape (N, max_length, vocab_size + 1) return logits class MASTERPostProcessor(_MASTERPostProcessor): """Post processor for MASTER architectures + Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -649,51 +561,66 @@


probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER: +def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"]) + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) - kwargs['vocab'] = _cfg['vocab'] + kwargs["vocab"] = _cfg["vocab"] + kwargs["input_shape"] = _cfg["input_shape"] # Build the model - model = MASTER(cfg=_cfg, **kwargs) + model = MASTER( + backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False), + cfg=_cfg, + **kwargs, + ) + _build_model(model) + # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model
-[docs] +[docs] def master(pretrained: bool = False, **kwargs: Any) -> MASTER: """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. - Example:: - >>> import tensorflow as tf - >>> from doctr.models import master - >>> model = master(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + + >>> import tensorflow as tf + >>> from doctr.models import master + >>> model = master(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keywoard arguments passed to the MASTER architecture + Returns: + ------- text recognition architecture """ - - return _master('master', pretrained, **kwargs)
+ return _master("master", pretrained, magc_resnet31, **kwargs)
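Putting the rewritten call path together: passing a list of label strings triggers teacher forcing and the masked cross-entropy loss, while calling without a target runs the autoregressive decode loop. A usage sketch assuming the public entry point:
>>> import tensorflow as tf
>>> from doctr.models import master
>>> model = master(pretrained=True)
>>> x = tf.random.uniform(shape=[2, 32, 128, 3], maxval=1, dtype=tf.float32)
>>> train_out = model(x, target=["hello", "world"], training=True)  # includes "loss"
>>> infer_out = model(x, return_preds=True)                         # includes "preds"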
diff --git a/v0.6.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/parseq/tensorflow.html
index 0819737dfc..b181acef53 100644
diff --git a/v0.6.0/_modules/doctr/models/recognition/sar.html b/v0.6.0/_modules/doctr/models/recognition/sar.html
deleted file mode 100644
index 2482e9f156..0000000000
--- a/v0.6.0/_modules/doctr/models/recognition/sar.html
+++ /dev/null
@@ -1,712 +0,0 @@
-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H * W) -> (N, 1)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, 1)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + 1) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
diff --git a/v0.6.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/sar/tensorflow.html
index e514e4f0c4..4a591e6451 100644
--- a/v0.6.0/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/v0.6.0/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -225,20 +225,42 @@


-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple
+
 import tensorflow as tf
-from tensorflow.keras import Sequential, layers, Model
-from typing import Tuple, Dict, List, Any, Optional
+from tensorflow.keras import Model, Sequential, layers
 
-from ... import backbones
-from ...utils import load_pretrained_params
-from ..core import RecognitionModel, RecognitionPostProcessor
+from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
 
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
+from ...classification import resnet31
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
+from ..core import RecognitionModel, RecognitionPostProcessor
+
+__all__ = ["SAR", "sar_resnet31"]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
+    "sar_resnet31": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (32, 128, 3),
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/sar_resnet31-5a58806c.weights.h5&src=0",
     },
 }
 
 
+class SAREncoder(layers.Layer, NestedObject):
+    """Implements encoder module of the SAR model
+
+    Args:
+    ----
+        rnn_units: number of hidden rnn units
+        dropout_prob: dropout probability
+    """
+
+    def __init__(self, rnn_units: int, dropout_prob: float = 0.0) -> None:
+        super().__init__()
+        self.rnn = Sequential([
+            layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
+            layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
+        ])
+
+    def call(
+        self,
+        x: tf.Tensor,
+        **kwargs: Any,
+    ) -> tf.Tensor:
+        # (N, C)
+        return self.rnn(x, **kwargs)
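For orientation, the encoder input is a width-wise sequence obtained by vertical max pooling of the backbone features (see SAR.call further down in this diff), and its last LSTM state serves as the holistic representation. A toy sketch of that shape flow:
>>> import tensorflow as tf
>>> features = tf.random.normal((2, 4, 32, 512))  # (N, H, W, C) backbone output
>>> pooled = tf.reduce_max(features, axis=1)      # (N, W, C): one feature vector per column
>>> encoder = tf.keras.Sequential([
...     tf.keras.layers.LSTM(512, return_sequences=True),
...     tf.keras.layers.LSTM(512, return_sequences=False),
... ])
>>> holistic = encoder(pooled)                    # (N, 512)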
+
+
 class AttentionModule(layers.Layer, NestedObject):
     """Implements attention module of the SAR model
 
     Args:
+    ----
         attention_units: number of hidden attention units
 
     """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
 
+    def __init__(self, attention_units: int) -> None:
         super().__init__()
         self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
+            attention_units,
+            3,
+            strides=1,
+            use_bias=True,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
+            1,
+            1,
+            strides=1,
+            use_bias=False,
+            padding="same",
+            kernel_initializer="he_normal",
         )
         self.flatten = layers.Flatten()
 
@@ -343,12 +395,12 @@ 


hidden_state: tf.Tensor, **kwargs: Any, ) -> tf.Tensor: - [H, W] = features.get_shape().as_list()[1:3] - # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) - hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) # shape (N, H, W, vgg_units) -> (N, H, W, attention_units) features_projection = self.features_projector(features, **kwargs) + # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) + hidden_state = tf.expand_dims(tf.expand_dims(hidden_state, axis=1), axis=1) + hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) projection = tf.math.tanh(hidden_state_projection + features_projection) # shape (N, H, W, attention_units) -> (N, H, W, 1) attention = self.attention_projector(projection, **kwargs) @@ -358,23 +410,25 @@


# shape (N, H * W) -> (N, H, W, 1) attention_map = tf.reshape(attention, [-1, H, W, 1]) glimpse = tf.math.multiply(features, attention_map) - # shape (N, H * W) -> (N, 1) - glimpse = tf.reduce_sum(glimpse, axis=[1, 2]) - return glimpse + # shape (N, H * W) -> (N, C) + return tf.reduce_sum(glimpse, axis=[1, 2]) class SARDecoder(layers.Layer, NestedObject): """Implements decoder module of the SAR model Args: + ---- rnn_units: number of hidden units in recurrent cells max_length: maximum length of a sequence vocab_size: number of classes in the model alphabet embedding_units: number of hidden embedding units attention_units: number of hidden attention units - num_decoder_layers: number of LSTM layers to stack + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability """ + def __init__( self, rnn_units: int, @@ -382,23 +436,22 @@


vocab_size: int, embedding_units: int, attention_units: int, - num_decoder_layers: int = 2, - input_shape: Optional[List[Tuple[Optional[int]]]] = None, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, ) -> None: - super().__init__() self.vocab_size = vocab_size - self.lstm_decoder = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] - ) - self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) - self.attention_module = AttentionModule(attention_units) - self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units)) self.max_length = max_length - # Initialize kernels - if input_shape is not None: - self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units))) + self.embed = layers.Dense(embedding_units, use_bias=False) + self.embed_tgt = layers.Embedding(embedding_units, self.vocab_size + 1) + + self.lstm_cells = layers.StackedRNNCells([ + layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells) + ]) + self.attention_module = AttentionModule(attention_units) + self.output_dense = layers.Dense(self.vocab_size + 1, use_bias=True) + self.dropout = layers.Dropout(dropout_prob) def call( self, @@ -407,40 +460,47 @@


gt: Optional[tf.Tensor] = None, **kwargs: Any, ) -> tf.Tensor: - - # initialize states (each of shape (N, rnn_units)) - states = self.lstm_decoder.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=tf.float32 - ) - # run first step of lstm - # holistic: shape (N, rnn_units) - _, states = self.lstm_decoder(holistic, states, **kwargs) - # Initialize with the index of virtual START symbol (placed after <eos>) - symbol = tf.fill(features.shape[0], self.vocab_size + 1) - logits_list = [] - if kwargs.get('training') and gt is None: - raise ValueError('Need to provide labels during training for teacher forcing') - for t in range(self.max_length + 1): # keep 1 step for <eos> - # one-hot symbol with depth vocab_size + 1 - # embeded_symbol: shape (N, embedding_units) - embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs) - logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs) - glimpse = self.attention_module( - features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs, - ) - # logits: shape (N, rnn_units), glimpse: shape (N, 1) - logits = tf.concat([logits, glimpse], axis=-1) - # shape (N, rnn_units + 1) -> (N, vocab_size + 1) - logits = self.output_dense(logits, **kwargs) - # update symbol with predicted logits for t+1 step - if kwargs.get('training'): - symbol = gt[:, t] # type: ignore[index] + if gt is not None: + gt_embedding = self.embed_tgt(gt, **kwargs) + + logits_list: List[tf.Tensor] = [] + + for t in range(self.max_length + 1): # 32 + if t == 0: + # step to init the first states of the LSTMCell + states = self.lstm_cells.get_initial_state( + inputs=None, batch_size=features.shape[0], dtype=features.dtype + ) + prev_symbol = holistic + elif t == 1: + # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros + # (N, vocab_size + 1) --> (N, embedding_units) + prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1], dtype=features.dtype) + prev_symbol = self.embed(prev_symbol, **kwargs) else: - symbol = tf.argmax(logits, axis=-1) - logits_list.append(logits) - outputs = tf.stack(logits_list, axis=1) # shape (N, max_length + 1, vocab_size + 1) - - return outputs + if gt is not None and kwargs.get("training", False): + # (N, embedding_units) -2 because of <bos> and <eos> (same) + prev_symbol = self.embed(gt_embedding[:, t - 2], **kwargs) + else: + # -1 to start at timestep where prev_symbol was initialized + index = tf.argmax(logits_list[t - 1], axis=-1) + # update prev_symbol with ones at the index of the previous logit vector + prev_symbol = self.embed(self.embed_tgt(index, **kwargs), **kwargs) + + # (N, C), (N, C) take the last hidden state and cell state from current timestep + _, states = self.lstm_cells(prev_symbol, states, **kwargs) + # states = (hidden_state, cell_state) + hidden_state = states[0][0] + # (N, H, W, C), (N, C) --> (N, C) + glimpse = self.attention_module(features, hidden_state, **kwargs) + # (N, C), (N, C) --> (N, 2 * C) + logits = tf.concat([hidden_state, glimpse], axis=1) + logits = self.dropout(logits, **kwargs) + # (N, vocab_size + 1) + logits_list.append(self.output_dense(logits, **kwargs)) + + # (max_length + 1, N, vocab_size + 1) --> (N, max_length + 1, vocab_size + 1) + return tf.transpose(tf.stack(logits_list[1:]), (1, 0, 2)) class SAR(Model, RecognitionModel): @@ -448,17 +508,20 @@


Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of hidden units in both encoder and decoder LSTM embedding_units: number of embedding units attention_units: number of hidden units in attention module max_length: maximum word length handled by the model - num_decoders: number of LSTM to stack in decoder layer - + num_decoder_cells: number of LSTMCell layers to stack + dropout_prob: dropout probability for the encoder and decoder + exportable: onnx exportable returns only logits + cfg: dictionary containing information about the model """ - _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor'] + _children_names: List[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"] def __init__( self, @@ -468,36 +531,34 @@


embedding_units: int = 512, attention_units: int = 512, max_length: int = 30, - num_decoders: int = 2, + num_decoder_cells: int = 2, + dropout_prob: float = 0.0, + exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: - super().__init__() self.vocab = vocab + self.exportable = exportable self.cfg = cfg - self.max_length = max_length + 1 # Add 1 timestep for EOS after the longest word self.feat_extractor = feature_extractor - self.encoder = Sequential( - [ - layers.LSTM(units=rnn_units, return_sequences=True), - layers.LSTM(units=rnn_units, return_sequences=False) - ] - ) - # Initialize the kernels (watch out for reduce_max) - self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:]) - + self.encoder = SAREncoder(rnn_units, dropout_prob) self.decoder = SARDecoder( - rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders, - input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape] + rnn_units, + self.max_length, + len(vocab), + embedding_units, + attention_units, + num_decoder_cells, + dropout_prob, ) self.postprocessor = SARPostProcessor(vocab=vocab) + @staticmethod def compute_loss( - self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: tf.Tensor, @@ -506,11 +567,13 @@


Sequences are masked after the EOS character. Args: + ---- gt: the encoded tensor with gt labels model_output: predicted logits of the model seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of timesteps @@ -525,7 +588,7 @@


mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len) masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) return tf.expand_dims(ce_loss, axis=1) def call( @@ -536,16 +599,28 @@


return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - features = self.feat_extractor(x, **kwargs) - pooled_features = tf.reduce_max(features, axis=1) # vertical max pooling + # vertical max pooling --> (N, C, W) + pooled_features = tf.reduce_max(features, axis=1) + # holistic (N, C) encoded = self.encoder(pooled_features, **kwargs) + if target is not None: - gt, seq_len = self.compute_target(target) + gt, seq_len = self.build_target(target) seq_len = tf.cast(seq_len, tf.int32) - decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + + if kwargs.get("training", False) and target is None: + raise ValueError("Need to provide labels during training for teacher forcing") + + decoded_features = _bf16_to_float32( + self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) + ) out: Dict[str, tf.Tensor] = {} + if self.exportable: + out["logits"] = decoded_features + return out + if return_model_output: out["out_map"] = decoded_features @@ -554,7 +629,7 @@

Source code for doctr.models.recognition.sar.tensorflow

out["preds"] = self.postprocessor(decoded_features) if target is not None: - out['loss'] = self.compute_loss(decoded_features, gt, seq_len) + out["loss"] = self.compute_loss(decoded_features, gt, seq_len) return out @@ -563,9 +638,8 @@

Source code for doctr.models.recognition.sar.tensorflow

"""Post processor for SAR architectures Args: + ---- vocab: string containing the ordered sequence of supported characters - ignore_case: if True, ignore case of letters - ignore_accents: if True, ignore accents of letters """ def __call__( @@ -580,95 +654,75 @@

Source code for doctr.models.recognition.sar.tensorflow

probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype='int32') + out_idxs = tf.cast(out_idxs, dtype="int32") embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - return list(zip(word_values, probs.numpy().tolist())) + return list(zip(word_values, probs.numpy().clip(0, 1).tolist())) -def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: +def _sar( + arch: str, + pretrained: bool, + backbone_fn, + pretrained_backbone: bool = True, + input_shape: Optional[Tuple[int, int, int]] = None, + **kwargs: Any, +) -> SAR: + pretrained_backbone = pretrained_backbone and not pretrained # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units']) - _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units']) - _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length']) - _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) + _cfg["input_shape"] = input_shape or _cfg["input_shape"] + _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) # Feature extractor - feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( - input_shape=_cfg['input_shape'], + feat_extractor = backbone_fn( + pretrained=pretrained_backbone, + input_shape=_cfg["input_shape"], include_top=False, ) - kwargs['vocab'] = _cfg['vocab'] - kwargs['rnn_units'] = _cfg['rnn_units'] - kwargs['embedding_units'] = _cfg['embedding_units'] - kwargs['attention_units'] = _cfg['attention_units'] - kwargs['max_length'] = _cfg['max_length'] - kwargs['num_decoders'] = _cfg['num_decoders'] + kwargs["vocab"] = _cfg["vocab"] # Build the model model = SAR(feat_extractor, cfg=_cfg, **kwargs) + _build_model(model) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"] + ) return model -
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - -
-[docs] +[docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + >>> import tensorflow as tf + >>> from doctr.models import sar_resnet31 + >>> model = sar_resnet31(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + **kwargs: keyword arguments of the SAR architecture Returns: + ------- text recognition architecture """ - - return _sar('sar_resnet31', pretrained, **kwargs)
+ return _sar("sar_resnet31", pretrained, resnet31, **kwargs)
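Since the call interface above accepts ground-truth words for teacher forcing, a training-style call can read the masked loss straight from the output dictionary. A minimal sketch based on that interface (random input, toy labels):

import tensorflow as tf
from doctr.models import sar_resnet31

model = sar_resnet31(pretrained=False)
input_tensor = tf.random.uniform(shape=[2, 64, 256, 3], maxval=1, dtype=tf.float32)

# providing target enables teacher forcing and adds a "loss" entry to the output dict
out = model(input_tensor, target=["hello", "world"], return_preds=True, training=False)
loss = out["loss"]    # per-sample masked cross-entropy
preds = out["preds"]  # (word, confidence) pairs from the post-processor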
@@ -702,8 +756,8 @@

Source code for doctr.models.recognition.sar.tensorflow

- +
+ diff --git a/v0.6.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 6e101893bf..c594d40a56 100644 --- a/v0.6.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -621,7 +621,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.6.0/_modules/doctr/models/recognition/zoo.html b/v0.6.0/_modules/doctr/models/recognition/zoo.html index bf0ae6af6e..f664304019 100644 --- a/v0.6.0/_modules/doctr/models/recognition/zoo.html +++ b/v0.6.0/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.models.recognition.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any, List
 
-from doctr.file_utils import is_tf_available, is_torch_available
-from .core import RecognitionPredictor
-from ..preprocessor import PreProcessor
-from .. import recognition
+from doctr.file_utils import is_tf_available
+from doctr.models.preprocessor import PreProcessor
 
+from .. import recognition
+from .predictor import RecognitionPredictor
 
 __all__ = ["recognition_predictor"]
 
 
-if is_tf_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
-elif is_torch_available():
-    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
+ARCHS: List[str] = [
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "sar_resnet31",
+    "master",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
+]
+
 
+def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+    if isinstance(arch, str):
+        if arch not in ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
 
-def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
+        _model = recognition.__dict__[arch](
+            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
+        )
+    else:
+        if not isinstance(
+            arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+        ):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
 
-    if arch not in ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+    kwargs.pop("pretrained_backbone", None)
 
-    _model = recognition.__dict__[arch](pretrained=pretrained)
-    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
-    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
-    kwargs['batch_size'] = kwargs.get('batch_size', 32)
-    predictor = RecognitionPredictor(
-        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
-        _model
-    )
+    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
+    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
+    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
+    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
 
     return predictor
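As the rewritten _predictor above shows, arch may now be either a registered architecture name or an already-instantiated recognition model. A short sketch of both call paths (assuming pretrained weights can be downloaded):

from doctr.models import recognition_predictor
from doctr.models.recognition import crnn_vgg16_bn

# 1) by name: the zoo builds the model and wraps it with a PreProcessor
predictor_by_name = recognition_predictor("crnn_vgg16_bn", pretrained=True)

# 2) by instance: pass a model you built (and possibly fine-tuned) yourself
custom_model = crnn_vgg16_bn(pretrained=False)
predictor_from_model = recognition_predictor(custom_model, batch_size=64)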
 
 
 
-[docs] -def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor: +[docs] +def recognition_predictor( + arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + symmetric_pad: bool = False, + batch_size: int = 128, + **kwargs: Any, +) -> RecognitionPredictor: """Text recognition architecture. Example:: @@ -326,14 +369,18 @@

Source code for doctr.models.recognition.zoo

        >>> out = model([input_page])
 
     Args:
-        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
+    ----
+        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
+        **kwargs: optional parameters to be passed to the architecture
 
     Returns:
+    -------
         Recognition predictor
     """
-
-    return _predictor(arch, pretrained, **kwargs)
+ return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
@@ -367,8 +414,8 @@

Source code for doctr.models.recognition.zoo

   
-
- +
+ diff --git a/v0.6.0/_modules/doctr/models/zoo.html b/v0.6.0/_modules/doctr/models/zoo.html index dec6857019..d459671648 100644 --- a/v0.6.0/_modules/doctr/models/zoo.html +++ b/v0.6.0/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -225,15 +225,42 @@

Source code for doctr.models.zoo

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from typing import Any
-from .core import OCRPredictor
+
 from .detection.zoo import detection_predictor
+from .kie_predictor import KIEPredictor
+from .predictor import OCRPredictor
 from .recognition.zoo import recognition_predictor
 
+__all__ = ["ocr_predictor", "kie_predictor"]
 
-__all__ = ["ocr_predictor"]
-
-
-def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
 
+def _predictor(
+    det_arch: Any,
+    reco_arch: Any,
+    pretrained: bool,
+    pretrained_backbone: bool = True,
+    assume_straight_pages: bool = True,
+    preserve_aspect_ratio: bool = True,
+    symmetric_pad: bool = True,
+    det_bs: int = 2,
+    reco_bs: int = 128,
+    detect_orientation: bool = False,
+    straighten_pages: bool = False,
+    detect_language: bool = False,
+    **kwargs,
+) -> OCRPredictor:
     # Detection
-    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
+    det_predictor = detection_predictor(
+        det_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=det_bs,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+    )
 
     # Recognition
-    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
+    reco_predictor = recognition_predictor(
+        reco_arch,
+        pretrained=pretrained,
+        pretrained_backbone=pretrained_backbone,
+        batch_size=reco_bs,
+    )
 
-    return OCRPredictor(det_predictor, reco_predictor)
+    return OCRPredictor(
+        det_predictor,
+        reco_predictor,
+        assume_straight_pages=assume_straight_pages,
+        preserve_aspect_ratio=preserve_aspect_ratio,
+        symmetric_pad=symmetric_pad,
+        detect_orientation=detect_orientation,
+        straighten_pages=straighten_pages,
+        detect_language=detect_language,
+        **kwargs,
+    )
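The helper above only wires a detection predictor and a recognition predictor into an OCRPredictor; a roughly equivalent manual composition looks like the sketch below (architecture names and options are illustrative):

from doctr.models import detection_predictor, recognition_predictor
from doctr.models.predictor import OCRPredictor

det = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True)
reco = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128)
pipeline = OCRPredictor(det, reco, assume_straight_pages=True)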
 
 
 
-[docs] +[docs] def ocr_predictor( - det_arch: str = 'db_resnet50', - reco_arch: str = 'crnn_vgg16_bn', + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", pretrained: bool = False, - **kwargs: Any + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - Example:: - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor(pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` Returns: + ------- OCR predictor """ + return _predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
+ + - return _predictor(det_arch, reco_arch, pretrained, **kwargs)
+def _kie_predictor( + det_arch: Any, + reco_arch: Any, + pretrained: bool, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + det_bs: int = 2, + reco_bs: int = 128, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs, +) -> KIEPredictor: + # Detection + det_predictor = detection_predictor( + det_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=det_bs, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + ) + + # Recognition + reco_predictor = recognition_predictor( + reco_arch, + pretrained=pretrained, + pretrained_backbone=pretrained_backbone, + batch_size=reco_bs, + ) + + return KIEPredictor( + det_predictor, + reco_predictor, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + ) + + +
+[docs] +def kie_predictor( + det_arch: Any = "fast_base", + reco_arch: Any = "crnn_vgg16_bn", + pretrained: bool = False, + pretrained_backbone: bool = True, + assume_straight_pages: bool = True, + preserve_aspect_ratio: bool = True, + symmetric_pad: bool = True, + export_as_straight_boxes: bool = False, + detect_orientation: bool = False, + straighten_pages: bool = False, + detect_language: bool = False, + **kwargs: Any, +) -> KIEPredictor: + """End-to-end KIE architecture using one model for localization, and another for text recognition. + + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + det_arch: name of the detection architecture or the model itself to use + (e.g. 'db_resnet50', 'db_mobilenet_v3_large') + reco_arch: name of the recognition architecture or the model itself to use + (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + pretrained: If True, returns a model pre-trained on our OCR dataset + pretrained_backbone: If True, returns a model with a pretrained backbone + assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages + without rotated textual elements. + preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before + running the detection model on it. + symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. + export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions + (potentially rotated) as straight bounding boxes. + detect_orientation: if True, the estimated general page orientation will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + straighten_pages: if True, estimates the page general orientation + based on the segmentation map median line orientation. + Then, rotates page before passing it again to the deep learning detection module. + Doing so will improve performances for documents with page-uniform rotations. + detect_language: if True, the language prediction will be added to the predictions for each + page. Doing so will slightly deteriorate the overall latency. + kwargs: keyword args of `OCRPredictor` + + Returns: + ------- + KIE predictor + """ + return _kie_predictor( + det_arch, + reco_arch, + pretrained, + pretrained_backbone=pretrained_backbone, + assume_straight_pages=assume_straight_pages, + preserve_aspect_ratio=preserve_aspect_ratio, + symmetric_pad=symmetric_pad, + export_as_straight_boxes=export_as_straight_boxes, + detect_orientation=detect_orientation, + straighten_pages=straighten_pages, + detect_language=detect_language, + **kwargs, + )
@@ -353,8 +575,8 @@

Source code for doctr.models.zoo

       
     
   
- - + + diff --git a/v0.6.0/_modules/doctr/transforms/modules.html b/v0.6.0/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.6.0/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - - - - - - - - - - - - doctr.transforms.modules - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/_modules/doctr/transforms/modules/base.html b/v0.6.0/_modules/doctr/transforms/modules/base.html index c42079a8fd..4596df3848 100644 --- a/v0.6.0/_modules/doctr/transforms/modules/base.html +++ b/v0.6.0/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.base

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+import math
 import random
-from typing import List, Any, Callable
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import numpy as np
 
 from doctr.utils.repr import NestedObject
+
 from .. import functional as F
 
+__all__ = ["SampleCompose", "ImageTransform", "ColorInversion", "OneOf", "RandomApply", "RandomRotate", "RandomCrop"]
+
+
+class SampleCompose(NestedObject):
+    """Implements a wrapper that will apply transformations sequentially on both image and target
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfo = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
+                >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4)))
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import numpy as np
+                >>> import torch
+                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
+                >>> transfos = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
+                >>> out, out_boxes = transfos(torch.rand(8, 64, 64, 3), np.zeros((2, 4)))
+
+    Args:
+    ----
+        transforms: list of transformation modules
+    """
+
+    _children_names: List[str] = ["sample_transforms"]
+
+    def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None:
+        self.sample_transforms = transforms
+
+    def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]:
+        for t in self.sample_transforms:
+            x, target = t(x, target)
+
+        return x, target
+
+
+class ImageTransform(NestedObject):
+    """Implements a transform wrapper to turn an image-only transformation into an image+target transform
+
+    .. tabs::
+
+        .. tab:: TensorFlow
+
+            .. code:: python
+
+                >>> import tensorflow as tf
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion((32, 32)))
+                >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None)
+
+        .. tab:: PyTorch
+
+            .. code:: python
+
+                >>> import torch
+                >>> from doctr.transforms import ImageTransform, ColorInversion
+                >>> transfo = ImageTransform(ColorInversion((32, 32)))
+                >>> out, _ = transfo(torch.rand(8, 64, 64, 3), None)
+
+    Args:
+    ----
+        transform: the image transformation module to wrap
+    """
+
+    _children_names: List[str] = ["img_transform"]
+
+    def __init__(self, transform: Callable[[Any], Any]) -> None:
+        self.img_transform = transform
 
-__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
+    def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]:
+        img = self.img_transform(img)
+        return img, target
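Any callable taking (img, target) and returning the transformed pair can be slotted into SampleCompose alongside ImageTransform-wrapped image transforms. A small sketch with an illustrative, hand-written sample-level transform (drop_empty_boxes is not part of docTR):

import numpy as np
import tensorflow as tf
from doctr.transforms import SampleCompose, ImageTransform, ColorInversion

def drop_empty_boxes(img, target):
    # illustrative sample-level transform: keep only boxes with a positive area
    keep = (target[:, 2] > target[:, 0]) & (target[:, 3] > target[:, 1])
    return img, target[keep]

transfo = SampleCompose([ImageTransform(ColorInversion()), drop_empty_boxes])
img, boxes = transfo(tf.random.uniform(shape=[64, 64, 3]), np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32))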
 
 
 
-[docs] +[docs] class ColorInversion(NestedObject): """Applies the following tranformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import ColorInversion + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(torch.rand(8, 64, 64, 3)) Args: + ---- min_val: range [min_val, 1] to colorize RGB pixels """ + def __init__(self, min_val: float = 0.5) -> None: self.min_val = min_val @@ -316,59 +437,178 @@

Source code for doctr.transforms.modules.base

-[docs] +[docs] class OneOf(NestedObject): """Randomly apply one of the input transformations - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import OneOf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transforms: list of transformations, one only will be picked """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: # Pick transformation transfo = self.transforms[int(random.random() * len(self.transforms))] # Apply - return transfo(img)
+ return transfo(img) if target is None else transfo(img, target) # type: ignore[call-arg]
-[docs] +[docs] class RandomApply(NestedObject): """Apply with a probability p the input transformation - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + .. tabs:: + + .. tab:: TensorFlow + + .. code:: python + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + .. tab:: PyTorch + + .. code:: python + + >>> import torch + >>> from doctr.transforms import RandomApply + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(torch.rand(1, 64, 64, 3)) Args: + ---- transform: transformation to apply p: probability to apply """ - def __init__(self, transform: Callable[[Any], Any], p: float = .5) -> None: + + def __init__(self, transform: Callable[[Any], Any], p: float = 0.5) -> None: self.transform = transform self.p = p def extra_repr(self) -> str: return f"transform={self.transform}, p={self.p}" - def __call__(self, img: Any) -> Any: + def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]: if random.random() < self.p: - return self.transform(img) - return img
+ return self.transform(img) if target is None else self.transform(img, target) # type: ignore[call-arg] + return img if target is None else (img, target)
+ + + +
+[docs] +class RandomRotate(NestedObject): + """Randomly rotate a tensor image and its boxes + + .. image:: https://doctr-static.mindee.com/models?id=v0.4.0/rotation_illustration.png&src=0 + :align: center + + Args: + ---- + max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in + [-max_angle, max_angle] + expand: whether the image should be padded before the rotation + """ + + def __init__(self, max_angle: float = 5.0, expand: bool = False) -> None: + self.max_angle = max_angle + self.expand = expand + + def extra_repr(self) -> str: + return f"max_angle={self.max_angle}, expand={self.expand}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + angle = random.uniform(-self.max_angle, self.max_angle) + r_img, r_polys = F.rotate_sample(img, target, angle, self.expand) + # Removes deleted boxes + is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2 + return r_img, r_polys[is_kept]
+ + + +
+[docs] +class RandomCrop(NestedObject): + """Randomly crop a tensor image and its boxes + + Args: + ---- + scale: tuple of floats, relative (min_area, max_area) of the crop + ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w + """ + + def __init__(self, scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: + self.scale = scale + self.ratio = ratio + + def extra_repr(self) -> str: + return f"scale={self.scale}, ratio={self.ratio}" + + def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: + scale = random.uniform(self.scale[0], self.scale[1]) + ratio = random.uniform(self.ratio[0], self.ratio[1]) + + height, width = img.shape[:2] + + # Calculate crop size + crop_area = scale * width * height + aspect_ratio = ratio * (width / height) + crop_width = int(round(math.sqrt(crop_area * aspect_ratio))) + crop_height = int(round(math.sqrt(crop_area / aspect_ratio))) + + # Ensure crop size does not exceed image dimensions + crop_width = min(crop_width, width) + crop_height = min(crop_height, height) + + # Randomly select crop position + x = random.randint(0, width - crop_width) + y = random.randint(0, height - crop_height) + + # relative crop box + crop_box = (x / width, y / height, (x + crop_width) / width, (y + crop_height) / height) + if target.shape[1:] == (4, 2): + min_xy = np.min(target, axis=1) + max_xy = np.max(target, axis=1) + _target = np.concatenate((min_xy, max_xy), axis=1) + else: + _target = target + + # Crop image and targets + croped_img, crop_boxes = F.crop_detection(img, _target, crop_box) + # hard fallback if no box is kept + if crop_boxes.shape[0] == 0: + return img, target + # clip boxes + return croped_img, np.clip(crop_boxes, 0, 1)
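RandomCrop consumes an image plus relative boxes and falls back to the untouched sample when the crop would drop every box. A minimal sketch (box coordinates illustrative, relative (xmin, ymin, xmax, ymax) as the code above assumes):

import numpy as np
import tensorflow as tf
from doctr.transforms import RandomCrop

boxes = np.array([[0.1, 0.1, 0.4, 0.3], [0.5, 0.6, 0.9, 0.8]], dtype=np.float32)
transfo = RandomCrop(scale=(0.5, 1.0), ratio=(0.75, 1.33))
img, new_boxes = transfo(tf.random.uniform(shape=[256, 256, 3]), boxes)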
@@ -402,8 +642,8 @@

Source code for doctr.transforms.modules.base

- - + + diff --git a/v0.6.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.6.0/_modules/doctr/transforms/modules/tensorflow.html index 1d192a876b..acbbe96225 100644 --- a/v0.6.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.6.0/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.transforms.modules.tensorflow

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
 import tensorflow as tf
-from typing import List, Any, Tuple, Callable
 
 from doctr.utils.repr import NestedObject
 
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
-           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
+from ..functional.tensorflow import _gaussian_filter, random_shadow
+
+__all__ = [
+    "Compose",
+    "Resize",
+    "Normalize",
+    "LambdaTransformation",
+    "ToGray",
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomSaturation",
+    "RandomHue",
+    "RandomGamma",
+    "RandomJpegQuality",
+    "GaussianBlur",
+    "ChannelShuffle",
+    "GaussianNoise",
+    "RandomHorizontalFlip",
+    "RandomShadow",
+    "RandomResize",
+]
 
 
 
-[docs] +[docs] class Compose(NestedObject): """Implements a wrapper that will apply transformations sequentially - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Compose, Resize + >>> transfos = Compose([Resize((32, 32))]) + >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- transforms: list of transformation modules """ - _children_names: List[str] = ['transforms'] + _children_names: List[str] = ["transforms"] def __init__(self, transforms: List[Callable[[Any], Any]]) -> None: self.transforms = transforms @@ -319,26 +361,27 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class Resize(NestedObject): """Resizes a tensor to a target size - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Resize + >>> transfo = Resize((32, 32)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- output_size: expected output size method: interpolation method preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically """ + def __init__( self, - output_size: Tuple[int, int], - method: str = 'bilinear', + output_size: Union[int, Tuple[int, int]], + method: str = "bilinear", preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, ) -> None: @@ -346,6 +389,14 @@

Source code for doctr.transforms.modules.tensorflow

self.method = method self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad + self.antialias = True + + if isinstance(self.output_size, int): + self.wanted_size = (self.output_size, self.output_size) + elif isinstance(self.output_size, (tuple, list)): + self.wanted_size = self.output_size + else: + raise AssertionError("Output size should be either a list, a tuple or an int") def extra_repr(self) -> str: _repr = f"output_size={self.output_size}, method='{self.method}'" @@ -353,64 +404,106 @@

Source code for doctr.transforms.modules.tensorflow

_repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" return _repr - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) + def __call__( + self, + img: tf.Tensor, + target: Optional[np.ndarray] = None, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: + input_dtype = img.dtype + self.output_size = ( + (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size + ) + + img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias) + # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio + raw_shape = img.shape[:2] + if self.symmetric_pad: + half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0) if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
+ if isinstance(self.output_size, (tuple, list)): + # In that case we need to pad because we want to enforce both width and height + if not self.symmetric_pad: + half_pad = (0, 0) + elif self.output_size[0] == img.shape[0]: + half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2)) + # Pad image + img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size) + + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: + if self.symmetric_pad: + offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1] + + if self.preserve_aspect_ratio: + # Get absolute coords + if target.shape[1:] == (4,): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1] + target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0] + else: + target[:, [0, 2]] *= raw_shape[1] / img.shape[1] + target[:, [1, 3]] *= raw_shape[0] / img.shape[0] + elif target.shape[1:] == (4, 2): + if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: + target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1] + target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0] + else: + target[..., 0] *= raw_shape[1] / img.shape[1] + target[..., 1] *= raw_shape[0] / img.shape[0] + else: + raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") + + return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1) + + return tf.cast(img, dtype=input_dtype)
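With its extended signature, Resize can rescale relative target boxes together with the image when the aspect ratio is preserved and padding is applied. A small sketch (box values illustrative):

import numpy as np
import tensorflow as tf
from doctr.transforms import Resize

resize = Resize((1024, 1024), preserve_aspect_ratio=True, symmetric_pad=True)
img = tf.random.uniform(shape=[600, 800, 3], minval=0, maxval=1)
boxes = np.array([[0.1, 0.2, 0.6, 0.5]], dtype=np.float32)  # relative (xmin, ymin, xmax, ymax)
resized_img, resized_boxes = resize(img, boxes)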
-[docs] +[docs] class Normalize(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import Normalize + >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- mean: average value per channel std: standard deviation per channel """ + def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) + self.mean = tf.constant(mean) + self.std = tf.constant(std) def extra_repr(self) -> str: return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std + img -= tf.cast(self.mean, dtype=img.dtype) + img /= tf.cast(self.std, dtype=img.dtype) return img
-[docs] +[docs] class LambdaTransformation(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import LambdaTransformation + >>> transfo = LambdaTransformation(lambda x: x/ 255.) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- fn: the function to be applied to the input tensor """ + def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: self.fn = fn @@ -420,37 +513,42 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class ToGray(NestedObject): """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import ToGray + >>> transfo = ToGray() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) """ + + def __init__(self, num_output_channels: int = 1): + self.num_output_channels = num_output_channels + def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
+ img = tf.image.rgb_to_grayscale(img) + return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
-[docs] +[docs] class RandomBrightness(NestedObject): """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomBrightness + >>> transfo = RandomBrightness() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] p: probability to apply transformation """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -463,21 +561,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomContrast(NestedObject): """Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomContrast + >>> transfo = RandomContrast() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) """ - def __init__(self, delta: float = .3) -> None: + + def __init__(self, delta: float = 0.3) -> None: self.delta = delta def extra_repr(self) -> str: @@ -489,21 +588,22 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomSaturation(NestedObject): """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor. - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomSaturation + >>> transfo = RandomSaturation() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) """ - def __init__(self, delta: float = .5) -> None: + + def __init__(self, delta: float = 0.5) -> None: self.delta = delta def extra_repr(self) -> str: @@ -515,19 +615,20 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomHue(NestedObject): """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHue + >>> transfo = RandomHue() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] """ + def __init__(self, max_delta: float = 0.3) -> None: self.max_delta = max_delta @@ -540,22 +641,23 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomGamma(NestedObject): """randomly performs gamma correction for a tensor (batch of images or image) - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomGamma + >>> transfo = RandomGamma() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: + ---- min_gamma: non-negative real number, lower bound for gamma param max_gamma: non-negative real number, upper bound for gamma min_gain: lower bound for constant multiplier max_gain: upper bound for constant multiplier """ + def __init__( self, min_gamma: float = 0.5, @@ -580,20 +682,21 @@

Source code for doctr.transforms.modules.tensorflow

-[docs] +[docs] class RandomJpegQuality(NestedObject): """Randomly adjust jpeg quality of a 3 dimensional RGB image - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + >>> import tensorflow as tf + >>> from doctr.transforms import RandomJpegQuality + >>> transfo = RandomJpegQuality() + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: + ---- min_quality: int between [0, 100] max_quality: int between [0, 100] """ + def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: self.min_quality = min_quality self.max_quality = max_quality @@ -602,10 +705,224 @@

Source code for doctr.transforms.modules.tensorflow

return f"min_quality={self.min_quality}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality + return tf.image.random_jpeg_quality(img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality)
+ + + +
+[docs] +class GaussianBlur(NestedObject): + """Randomly blur a 3 dimensional RGB image with a Gaussian kernel + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianBlur + >>> transfo = GaussianBlur(3, (.1, 5)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + kernel_shape: size of the blurring kernel + std: min and max value of the standard deviation + """ + + def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None: + self.kernel_shape = kernel_shape + self.std = std + + def extra_repr(self) -> str: + return f"kernel_shape={self.kernel_shape}, std={self.std}" + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.squeeze( + _gaussian_filter( + img[tf.newaxis, ...], + kernel_size=self.kernel_shape, + sigma=random.uniform(self.std[0], self.std[1]), + mode="REFLECT", + ), + axis=0, )
+ + +
+[docs] +class ChannelShuffle(NestedObject): + """Randomly shuffle channel order of a given image""" + + def __init__(self): + pass + + def __call__(self, img: tf.Tensor) -> tf.Tensor: + return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
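A note on the implementation above: `tf.random.shuffle` only permutes along the first axis, hence the transpose to channel-first before shuffling and back afterwards. Illustrative sketch (not part of the library):

import tensorflow as tf

img = tf.random.uniform(shape=[64, 64, 3])
chw = tf.transpose(img, perm=[2, 0, 1])       # (3, 64, 64): channels on axis 0
shuffled = tf.random.shuffle(chw)             # permutes the 3 channels
out = tf.transpose(shuffled, perm=[1, 2, 0])  # back to (64, 64, 3)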
+ + + +
+[docs] +class GaussianNoise(NestedObject): + """Adds Gaussian Noise to the input tensor + + >>> import tensorflow as tf + >>> from doctr.transforms import GaussianNoise + >>> transfo = GaussianNoise(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + mean : mean of the gaussian distribution + std : std of the gaussian distribution + """ + + def __init__(self, mean: float = 0.0, std: float = 1.0) -> None: + super().__init__() + self.std = std + self.mean = mean + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), dtype=tf.uint8 + ) + else: + return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype) + + def extra_repr(self) -> str: + return f"mean={self.mean}, std={self.std}"
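As written above, the noise term is drawn as mean + 2 * std * U(0, 1) - std, i.e. uniformly from [mean - std, mean + std]. A small sanity-check sketch (not part of the library) verifying the bounds:

import tensorflow as tf

mean, std = 0.0, 1.0
noise = mean + 2 * std * tf.random.uniform([100_000]) - std
print(float(tf.reduce_min(noise)), float(tf.reduce_max(noise)))  # close to -1.0 and 1.0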
+ + + +
+[docs] +class RandomHorizontalFlip(NestedObject): + """Adds random horizontal flip to the input tensor/np.ndarray + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomHorizontalFlip + >>> transfo = RandomHorizontalFlip(p=0.5) + >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1) + >>> target = np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32) + >>> out = transfo(image, target) + + Args: + ---- + p : probability of Horizontal Flip + """ + + def __init__(self, p: float) -> None: + super().__init__() + self.p = p + + def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + _img = tf.image.flip_left_right(img) + _target = target.copy() + # Changing the relative bbox coordinates + if target.shape[1:] == (4,): + _target[:, ::2] = 1 - target[:, [2, 0]] + else: + _target[..., 0] = 1 - target[..., 0] + return _img, _target + return img, target
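Worked example (illustrative only) of the relative-box update above for the (xmin, ymin, xmax, ymax) format: the new xmin becomes 1 - old xmax and the new xmax becomes 1 - old xmin.

import numpy as np

target = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)  # (xmin, ymin, xmax, ymax)
flipped = target.copy()
flipped[:, ::2] = 1 - target[:, [2, 0]]
print(flipped)  # [[0.6 0.1 0.9 0.5]]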
+ + + +
+[docs] +class RandomShadow(NestedObject): + """Adds random shade to the input image + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomShadow + >>> transfo = RandomShadow(0., 1.) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + opacity_range : minimum and maximum opacity of the shade + """ + + def __init__(self, opacity_range: Optional[Tuple[float, float]] = None) -> None: + super().__init__() + self.opacity_range = opacity_range if isinstance(opacity_range, tuple) else (0.2, 0.8) + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + # Reshape the distribution + if x.dtype == tf.uint8: + return tf.cast( + tf.clip_by_value( + tf.math.round(255 * random_shadow(tf.cast(x, dtype=tf.float32) / 255, self.opacity_range)), + 0, + 255, + ), + dtype=tf.uint8, + ) + else: + return tf.clip_by_value(random_shadow(x, self.opacity_range), 0, 1) + + def extra_repr(self) -> str: + return f"opacity_range={self.opacity_range}"
+ + + +
+[docs] +class RandomResize(NestedObject): + """Randomly resize the input image and align corresponding targets + + >>> import tensorflow as tf + >>> from doctr.transforms import RandomResize + >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + + Args: + ---- + scale_range: range of the resizing factor for width and height (independently) + preserve_aspect_ratio: whether to preserve the aspect ratio of the image, + given a float value, the aspect ratio will be preserved with this probability + symmetric_pad: whether to symmetrically pad the image, + given a float value, the symmetric padding will be applied with this probability + p: probability to apply the transformation + """ + + def __init__( + self, + scale_range: Tuple[float, float] = (0.3, 0.9), + preserve_aspect_ratio: Union[bool, float] = False, + symmetric_pad: Union[bool, float] = False, + p: float = 0.5, + ): + super().__init__() + self.scale_range = scale_range + self.preserve_aspect_ratio = preserve_aspect_ratio + self.symmetric_pad = symmetric_pad + self.p = p + self._resize = Resize + + def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + if np.random.rand(1) <= self.p: + scale_h = random.uniform(*self.scale_range) + scale_w = random.uniform(*self.scale_range) + new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w)) + + _img, _target = self._resize( + new_size, + preserve_aspect_ratio=self.preserve_aspect_ratio + if isinstance(self.preserve_aspect_ratio, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + symmetric_pad=self.symmetric_pad + if isinstance(self.symmetric_pad, bool) + else bool(np.random.rand(1) <= self.symmetric_pad), + )(img, target) + + return _img, _target + return img, target + + def extra_repr(self) -> str: + return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}" # noqa: E501
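Illustrative sketch of the size computation performed in `RandomResize.__call__`: height and width scale factors are drawn independently from `scale_range` (the names below are only for illustration).

import random

scale_range = (0.3, 0.9)
h, w = 64, 64  # input spatial shape
scale_h, scale_w = random.uniform(*scale_range), random.uniform(*scale_range)
new_size = (int(h * scale_h), int(w * scale_w))  # e.g. (25, 49)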
+
@@ -638,8 +955,8 @@

Source code for doctr.transforms.modules.tensorflow

- +
+ diff --git a/v0.6.0/_modules/doctr/utils/metrics.html b/v0.6.0/_modules/doctr/utils/metrics.html index 460c64a385..8a37d5949a 100644 --- a/v0.6.0/_modules/doctr/utils/metrics.html +++ b/v0.6.0/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.metrics

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
-import cv2
-from typing import List, Tuple, Dict, Optional
-from unidecode import unidecode
+from anyascii import anyascii
 from scipy.optimize import linear_sum_assignment
-from doctr.utils.geometry import rbbox_to_polygon
+from shapely.geometry import Polygon
 
-__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
-           'nms', 'LocalizationConfusion', 'OCRMetric']
+__all__ = [
+    "TextMatch",
+    "box_iou",
+    "polygon_iou",
+    "nms",
+    "LocalizationConfusion",
+    "OCRMetric",
+    "DetectionMetric",
+]
 
 
 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
-    """Perform string comparison with multiple levels of tolerance
+    """Performs string comparison with multiple levels of tolerance
 
     Args:
+    ----
         word1: a string
         word2: another string
 
     Returns:
+    -------
         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
-            unidecode counterparts and their lower-case unidecode counterparts match
+            anyascii counterparts and their lower-case anyascii counterparts match
     """
-    raw_match = (word1 == word2)
-    caseless_match = (word1.lower() == word2.lower())
-    unidecode_match = (unidecode(word1) == unidecode(word2))
+    raw_match = word1 == word2
+    caseless_match = word1.lower() == word2.lower()
+    anyascii_match = anyascii(word1) == anyascii(word2)
 
     # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
-    unicase_match = (unidecode(word1).lower() == unidecode(word2).lower())
+    unicase_match = anyascii(word1).lower() == anyascii(word2).lower()
 
-    return raw_match, caseless_match, unidecode_match, unicase_match
+    return raw_match, caseless_match, anyascii_match, unicase_match
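Illustrative example of the four tolerance levels (assuming the `anyascii` package transliterates "é" to "e"): for this pair only the lower-case anyascii comparison matches.

from doctr.utils.metrics import string_match

print(string_match("Café", "cafe"))  # (False, False, False, True) with this implementation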
 
 
 
-[docs] +[docs] class TextMatch: - """Implements text match metric (word-level accuracy) for recognition task. + r"""Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \\forall X, Y \\in \\mathcal{W}^N, - TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) + \forall X, Y \in \mathcal{W}^N, + TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \\forall a, x \\in \\mathcal{W}, - f_a(x) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } x = a \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{W}` is the set of all possible character sequences, + \forall a, x \in \mathcal{W}, + f_a(x) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } x = a \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. - Example:: - >>> from doctr.utils import TextMatch - >>> metric = TextMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() """ def __init__(self) -> None: self.reset() +
+[docs] def update( self, gt: List[str], @@ -354,29 +386,32 @@

Source code for doctr.utils.metrics

         """Update the state of the metric with new predictions
 
         Args:
+        ----
            gt: list of ground-truth character sequences
-            pred: list of predicted character sequences"""
-
+            pred: list of predicted character sequences
+        """
         if len(gt) != len(pred):
             raise AssertionError("prediction size does not match with ground-truth labels size")
 
         for gt_word, pred_word in zip(gt, pred):
-            _raw, _caseless, _unidecode, _unicase = string_match(gt_word, pred_word)
+            _raw, _caseless, _anyascii, _unicase = string_match(gt_word, pred_word)
             self.raw += int(_raw)
             self.caseless += int(_caseless)
-            self.unidecode += int(_unidecode)
+            self.anyascii += int(_anyascii)
             self.unicase += int(_unicase)
 
-        self.total += len(gt)
+        self.total += len(gt)
+
-[docs] +[docs] def summary(self) -> Dict[str, float]: """Computes the aggregated metrics - Returns: - a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode - counterpart and its lower-case unidecode counterpart + Returns + ------- + a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii + counterpart and its lower-case anyascii counterpart """ if self.total == 0: raise AssertionError("you need to update the metric before getting the summary") @@ -384,7 +419,7 @@

Source code for doctr.utils.metrics

         return dict(
             raw=self.raw / self.total,
             caseless=self.caseless / self.total,
-            unidecode=self.unidecode / self.total,
+            anyascii=self.anyascii / self.total,
             unicase=self.unicase / self.total,
         )
@@ -392,23 +427,25 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.raw = 0
         self.caseless = 0
-        self.unidecode = 0
+        self.anyascii = 0
         self.unicase = 0
         self.total = 0
def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Compute the IoU between two sets of bounding boxes + """Computes the IoU between two sets of bounding boxes Args: + ---- boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) + Returns: + ------- the IoU matrix of shape (N, M) """ - - iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) + iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) @@ -419,107 +456,54 @@

Source code for doctr.utils.metrics

         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)
 
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union
 
     return iou_mat
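Worked example (illustrative): the first prediction overlaps the ground truth with area 70 x 70 = 4900 over a union of 100 x 100 = 10000, while the second does not overlap at all.

import numpy as np
from doctr.utils.metrics import box_iou

gts = np.array([[0, 0, 100, 100]], dtype=np.float32)
preds = np.array([[0, 0, 70, 70], [110, 95, 200, 150]], dtype=np.float32)
print(box_iou(gts, preds))  # approximately [[0.49, 0.0]]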
 
 
-def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
-    """Compute the IoA (intersection over area) between two sets of bounding boxes:
-    ioa(i, j) = inter(i, j) / area(i)
-
-    Args:
-        boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
-        boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
-    Returns:
-        the IoA matrix of shape (N, M)
-    """
-
-    ioa_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
-
-    if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
-        l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
-        l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1)
-
-        left = np.maximum(l1, l2.T)
-        top = np.maximum(t1, t2.T)
-        right = np.minimum(r1, r2.T)
-        bot = np.minimum(b1, b2.T)
-
-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
-        area = (r1 - l1) * (b1 - t1)
-        ioa_mat = intersection / area
-
-    return ioa_mat
-
-
-def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
-    """Compute the IoU between two sets of boolean masks
+def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray:
+    """Computes the IoU between two sets of rotated bounding boxes
 
     Args:
-        masks_1: boolean masks of shape (N, H, W)
-        masks_2: boolean masks of shape (M, H, W)
+    ----
+        polys_1: rotated bounding boxes of shape (N, 4, 2)
+        polys_2: rotated bounding boxes of shape (M, 4, 2)
 
     Returns:
+    -------
         the IoU matrix of shape (N, M)
     """
+    if polys_1.ndim != 3 or polys_2.ndim != 3:
+        raise AssertionError("expects boxes to be in format (N, 4, 2)")
 
-    if masks_1.shape[1:] != masks_2.shape[1:]:
-        raise AssertionError("both boolean masks should have the same spatial shape")
+    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
 
-    iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
+    shapely_polys_1 = [Polygon(poly) for poly in polys_1]
+    shapely_polys_2 = [Polygon(poly) for poly in polys_2]
 
-    if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
-        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
-        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
-        axes = tuple(range(2, masks_1.ndim + 1))
-        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
+    for i, poly1 in enumerate(shapely_polys_1):
+        for j, poly2 in enumerate(shapely_polys_2):
+            intersection_area = poly1.intersection(poly2).area
+            union_area = poly1.area + poly2.area - intersection_area
+            iou_mat[i, j] = intersection_area / union_area
 
     return iou_mat
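Worked example (illustrative) with two axis-aligned unit squares expressed as (N, 4, 2) polygons, the second shifted by half a unit: intersection 0.5, union 1.5, so the IoU is 1/3.

import numpy as np
from doctr.utils.metrics import polygon_iou

polys_1 = np.array([[[0, 0], [1, 0], [1, 1], [0, 1]]], dtype=np.float32)
polys_2 = np.array([[[0.5, 0], [1.5, 0], [1.5, 1], [0.5, 1]]], dtype=np.float32)
print(polygon_iou(polys_1, polys_2))  # approximately [[0.3333]]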
 
 
-def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
-    """Convert boxes to masks
-
-    Args:
-        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
-        shape: spatial shapes of the output masks
-
-    Returns:
-        the boolean masks of shape (N, H, W)
-    """
-
-    masks = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
-
-    if boxes.shape[0] > 0:
-        # Get absolute coordinates
-        if boxes.dtype != np.int:
-            abs_boxes = boxes.copy()
-            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
-            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
-            abs_boxes = abs_boxes.round().astype(np.int)
-        else:
-            abs_boxes = boxes
-            abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
-
-        # TODO: optimize slicing to improve vectorization
-        for idx, _box in enumerate(abs_boxes):
-            box = rbbox_to_polygon(_box)
-            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
-
-    return masks.astype(bool)
-
-
-def nms(boxes: np.ndarray, thresh: float = .5) -> List[int]:
+def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
     """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
 
     Args:
+    ----
         boxes: np array of straight boxes: (*, 5), (xmin, ymin, xmax, ymax, score)
         thresh: iou threshold to perform box suppression.
 
     Returns:
+    -------
         A list of box indexes to keep
     """
     x1 = boxes[:, 0]
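The remainder of the `nms` body lies outside this hunk; a minimal greedy non-max-suppression sketch with the same documented interface (boxes of shape (*, 5) as (xmin, ymin, xmax, ymax, score), returning the indices to keep) could look as follows. This is an illustration, not the library's exact implementation.

import numpy as np
from typing import List

def nms_sketch(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
    x1, y1, x2, y2, scores = (boxes[:, i] for i in range(5))
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]  # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        # IoU between the kept box and the remaining candidates
        xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]])
        xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]])
        inter = np.clip(xx2 - xx1, 0, None) * np.clip(yy2 - yy1, 0, None)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= thresh]  # drop boxes overlapping the kept one too much
    return keep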
@@ -551,66 +535,71 @@ 

Source code for doctr.utils.metrics

 
 
 
-[docs] +[docs] class LocalizationConfusion: - """Implements common confusion metrics and mean IoU for localization evaluation. + r"""Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ - Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ - meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) + \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ + Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ + Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ + meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \\forall y \\in \\mathcal{B}, - g_X(y) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, + \forall y \in \mathcal{B}, + g_X(y) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: + """Updates the metric + Args: + ---- + gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + """ if preds.shape[0] > 0: # Compute IoU - if self.rotated_bbox: - mask_gts = rbox_to_mask(gts, shape=self.mask_shape) - mask_preds = rbox_to_mask(preds, shape=self.mask_shape) - iou_mat = mask_iou(mask_gts, mask_preds) + if self.use_polygons: + iou_mat = polygon_iou(gts, preds) else: iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=1).sum()) + self.tot_iou += float(iou_mat.max(axis=0).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -618,17 +607,18 @@

Source code for doctr.utils.metrics

 
         # Update counts
         self.num_gts += gts.shape[0]
-        self.num_preds += preds.shape[0]
+        self.num_preds += preds.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: """Computes the aggregated metrics - Returns: + Returns + ------- a tuple with the recall, precision and meanIoU scores """ - # Recall recall = self.matches / self.num_gts if self.num_gts > 0 else None @@ -636,7 +626,7 @@

Source code for doctr.utils.metrics

         precision = self.matches / self.num_preds if self.num_preds > 0 else None
 
         # mean IoU
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -645,64 +635,65 @@

Source code for doctr.utils.metrics

         self.num_gts = 0
         self.num_preds = 0
         self.matches = 0
-        self.tot_iou = 0.
+ self.tot_iou = 0.0
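Illustrative usage (numbers are approximate): one ground truth and two predictions, only the first of which clears the IoU threshold, so it counts as a match while both predictions enter the mean-IoU denominator.

import numpy as np
from doctr.utils.metrics import LocalizationConfusion

metric = LocalizationConfusion(iou_thresh=0.5)
gts = np.array([[0, 0, 100, 100]], dtype=np.float32)
preds = np.array([[0, 0, 80, 80], [110, 95, 200, 150]], dtype=np.float32)
metric.update(gts, preds)
recall, precision, mean_iou = metric.summary()
# recall = 1.0 (1 matched ground truth out of 1), precision = 0.5 (1 match out of 2 predictions),
# mean_iou ~= 0.32 ((0.64 + 0.0) / 2)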
-[docs] +[docs] class OCRMetric: - """Implements end-to-end OCR metric. + r"""Implements an end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, - \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ - Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ - meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) + \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, + \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ + Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, - h_{B,L}(b, l) = \\left\\{ - \\begin{array}{ll} - 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ - & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ - 0 & \\mbox{otherwise.} - \\end{array} - \\right. - - where :math:`\\mathcal{B}` is the set of possible bounding boxes, - :math:`\\mathcal{L}` is the set of possible character sequences, + \forall (b, l) \in \mathcal{B} \times \mathcal{L}, + h_{B,L}(b, l) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - Example:: - >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - ['hello'], ['hello', 'world']) - >>> metric.summary() + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> ['hello'], ['hello', 'world']) + >>> metric.summary() Args: + ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format """ def __init__( self, iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), + use_polygons: bool = False, ) -> None: self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape + self.use_polygons = use_polygons self.reset() +
+[docs] def update( self, gt_boxes: np.ndarray, @@ -710,50 +701,58 @@

Source code for doctr.utils.metrics

         gt_labels: List[str],
         pred_labels: List[str],
     ) -> None:
+        """Updates the metric
 
+        Args:
+        ----
+            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+            gt_labels: a list of N string labels
+            pred_labels: a list of M string labels
+        """
         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
-            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
-                                 "and the predictions")
+            raise AssertionError(
+                "there should be the same number of boxes and string both for the ground truth and the predictions"
+            )
 
         # Compute IoU
         if pred_boxes.shape[0] > 0:
-            if self.rotated_bbox:
-                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
-                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
-                iou_mat = mask_iou(mask_gts, mask_preds)
+            if self.use_polygons:
+                iou_mat = polygon_iou(gt_boxes, pred_boxes)
             else:
                 iou_mat = box_iou(gt_boxes, pred_boxes)
 
-            self.tot_iou += float(iou_mat.max(axis=1).sum())
+            self.tot_iou += float(iou_mat.max(axis=0).sum())
 
             # Assign pairs
             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
             is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
             # String comparison
             for gt_idx, pred_idx in zip(gt_indices[is_kept], pred_indices[is_kept]):
-                _raw, _caseless, _unidecode, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
+                _raw, _caseless, _anyascii, _unicase = string_match(gt_labels[gt_idx], pred_labels[pred_idx])
                 self.raw_matches += int(_raw)
                 self.caseless_matches += int(_caseless)
-                self.unidecode_matches += int(_unidecode)
+                self.anyascii_matches += int(_anyascii)
                 self.unicase_matches += int(_unicase)
 
         self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
+        self.num_preds += pred_boxes.shape[0]
+
-[docs] +[docs] def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]: """Computes the aggregated metrics - Returns: - a tuple with the recall & precision for each string comparison flexibility and the mean IoU + Returns + ------- + a tuple with the recall & precision for each string comparison and the mean IoU """ - # Recall recall = dict( raw=self.raw_matches / self.num_gts if self.num_gts > 0 else None, caseless=self.caseless_matches / self.num_gts if self.num_gts > 0 else None, - unidecode=self.unidecode_matches / self.num_gts if self.num_gts > 0 else None, + anyascii=self.anyascii_matches / self.num_gts if self.num_gts > 0 else None, unicase=self.unicase_matches / self.num_gts if self.num_gts > 0 else None, ) @@ -761,12 +760,12 @@

Source code for doctr.utils.metrics

         precision = dict(
             raw=self.raw_matches / self.num_preds if self.num_preds > 0 else None,
             caseless=self.caseless_matches / self.num_preds if self.num_preds > 0 else None,
-            unidecode=self.unidecode_matches / self.num_preds if self.num_preds > 0 else None,
+            anyascii=self.anyascii_matches / self.num_preds if self.num_preds > 0 else None,
             unicase=self.unicase_matches / self.num_preds if self.num_preds > 0 else None,
         )
 
         # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
+        mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None
 
         return recall, precision, mean_iou
@@ -774,12 +773,136 @@

Source code for doctr.utils.metrics

     def reset(self) -> None:
         self.num_gts = 0
         self.num_preds = 0
-        self.tot_iou = 0.
+        self.tot_iou = 0.0
         self.raw_matches = 0
         self.caseless_matches = 0
-        self.unidecode_matches = 0
+        self.anyascii_matches = 0
         self.unicase_matches = 0
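Illustrative sketch of the assignment step shared by the metrics above: `linear_sum_assignment` is run on the negated IoU matrix so that the resulting one-to-one pairing maximizes total IoU, and only pairs above `iou_thresh` are kept before labels or classes are compared.

import numpy as np
from scipy.optimize import linear_sum_assignment

iou_mat = np.array([[0.7, 0.1],   # rows: ground truths
                    [0.2, 0.6]])  # cols: predictions
gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
is_kept = iou_mat[gt_indices, pred_indices] >= 0.5
pairs = [(int(i), int(j)) for i, j in zip(gt_indices[is_kept], pred_indices[is_kept])]
print(pairs)  # [(0, 0), (1, 1)]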
+ + +
+[docs] +class DetectionMetric: + r"""Implements an object detection metric. + + The aggregated metrics are computed as follows: + + .. math:: + \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, + \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ + Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ + meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + + with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and + :math:`y`, and the function :math:`h_{B, C}` defined as: + + .. math:: + \forall (b, c) \in \mathcal{B} \times \mathcal{C}, + h_{B,C}(b, c) = \left\{ + \begin{array}{ll} + 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ + & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ + 0 & \mbox{otherwise.} + \end{array} + \right. + + where :math:`\mathcal{B}` is the set of possible bounding boxes, + :math:`\mathcal{C}` is the set of possible class indices, + :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. + + >>> import numpy as np + >>> from doctr.utils import DetectionMetric + >>> metric = DetectionMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) + >>> metric.summary() + + Args: + ---- + iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match + use_polygons: if set to True, predictions and targets will be expected to have rotated format + """ + + def __init__( + self, + iou_thresh: float = 0.5, + use_polygons: bool = False, + ) -> None: + self.iou_thresh = iou_thresh + self.use_polygons = use_polygons + self.reset() + +
+[docs] + def update( + self, + gt_boxes: np.ndarray, + pred_boxes: np.ndarray, + gt_labels: np.ndarray, + pred_labels: np.ndarray, + ) -> None: + """Updates the metric + + Args: + ---- + gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones + pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones + gt_labels: an array of class indices of shape (N,) + pred_labels: an array of class indices of shape (M,) + """ + if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: + raise AssertionError( + "there should be the same number of boxes and string both for the ground truth and the predictions" + ) + + # Compute IoU + if pred_boxes.shape[0] > 0: + if self.use_polygons: + iou_mat = polygon_iou(gt_boxes, pred_boxes) + else: + iou_mat = box_iou(gt_boxes, pred_boxes) + + self.tot_iou += float(iou_mat.max(axis=0).sum()) + + # Assign pairs + gt_indices, pred_indices = linear_sum_assignment(-iou_mat) + is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh + # Category comparison + self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) + + self.num_gts += gt_boxes.shape[0] + self.num_preds += pred_boxes.shape[0]
+ + +
+[docs] + def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Computes the aggregated metrics + + Returns + ------- + a tuple with the recall & precision for each class prediction and the mean IoU + """ + # Recall + recall = self.num_matches / self.num_gts if self.num_gts > 0 else None + + # Precision + precision = self.num_matches / self.num_preds if self.num_preds > 0 else None + + # mean IoU (overall detected boxes) + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None + + return recall, precision, mean_iou
+ + + def reset(self) -> None: + self.num_gts = 0 + self.num_preds = 0 + self.tot_iou = 0.0 + self.num_matches = 0
+
@@ -812,8 +935,8 @@

Source code for doctr.utils.metrics

       
     
   
- - + + diff --git a/v0.6.0/_modules/doctr/utils/visualization.html b/v0.6.0/_modules/doctr/utils/visualization.html index 8e7dcca811..c818be6d7b 100644 --- a/v0.6.0/_modules/doctr/utils/visualization.html +++ b/v0.6.0/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -225,20 +225,42 @@

Source code for doctr.utils.visualization

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import colorsys
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-import matplotlib.pyplot as plt
-from matplotlib.figure import Figure
+import cv2
 import matplotlib.patches as patches
-import mplcursors
-from PIL import ImageFont, ImageDraw, Image
+import matplotlib.pyplot as plt
 import numpy as np
-import cv2
-from typing import Tuple, List, Dict, Any, Union
+from matplotlib.figure import Figure
 
-from .common_types import BoundingBox, RotatedBbox
+from .common_types import BoundingBox, Polygon4P
 
-__all__ = ['visualize_page', 'synthetize_page']
+__all__ = ["visualize_page", "visualize_kie_page", "draw_boxes"]
 
 
-def create_rect_patch(
-    geometry: Union[BoundingBox, RotatedBbox],
-    label: str,
+def rect_patch(
+    geometry: BoundingBox,
     page_dimensions: Tuple[int, int],
-    color: Tuple[int, int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
     alpha: float = 0.3,
     linewidth: int = 2,
     fill: bool = True,
-) -> patches.Patch:
-    """Create a matplotlib patch (rectangle) bounding the element
+    preserve_aspect_ratio: bool = False,
+) -> patches.Rectangle:
+    """Create a matplotlib rectangular patch for the element
 
     Args:
+    ----
         geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
         label: label to display when hovered
-        page_dimensions: dimensions of the Page
         color: color to draw box
         alpha: opacity parameter to fill the boxes, 0 = transparent
         linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
 
     Returns:
+    -------
         a rectangular Patch
     """
+    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
     height, width = page_dimensions
-    if len(geometry) == 5:
-        x, y, w, h, a = geometry  # type: ignore[misc]
-        x, w = x * width, w * width
-        y, h = y * height, h * height
-        points = cv2.boxPoints(((x, y), (w, h), a))
-        return patches.Polygon(
-            points,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
-    else:
-        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
-        xmin, xmax = xmin * width, xmax * width
-        ymin, ymax = ymin * height, ymax * height
-        return patches.Rectangle(
-            (xmin, ymin),
-            xmax - xmin,
-            ymax - ymin,
-            fill=fill,
-            linewidth=linewidth,
-            edgecolor=(*color, alpha),
-            facecolor=(*color, alpha),
-            label=label
-        )
+    (xmin, ymin), (xmax, ymax) = geometry
+    # Switch to absolute coords
+    if preserve_aspect_ratio:
+        width = height = max(height, width)
+    xmin, w = xmin * width, (xmax - xmin) * width
+    ymin, h = ymin * height, (ymax - ymin) * height
+
+    return patches.Rectangle(
+        (xmin, ymin),
+        w,
+        h,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def polygon_patch(
+    geometry: np.ndarray,
+    page_dimensions: Tuple[int, int],
+    label: Optional[str] = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    alpha: float = 0.3,
+    linewidth: int = 2,
+    fill: bool = True,
+    preserve_aspect_ratio: bool = False,
+) -> patches.Polygon:
+    """Create a matplotlib polygon patch for the element
+
+    Args:
+    ----
+        geometry: bounding box of the element
+        page_dimensions: dimensions of the Page in format (height, width)
+        label: label to display when hovered
+        color: color to draw box
+        alpha: opacity parameter to fill the boxes, 0 = transparent
+        linewidth: line width
+        fill: whether the patch should be filled
+        preserve_aspect_ratio: pass True if you passed True to the predictor
+
+    Returns:
+    -------
+        a polygon Patch
+    """
+    if not geometry.shape == (4, 2):
+        raise ValueError("invalid geometry format")
+
+    # Unpack
+    height, width = page_dimensions
+    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
+    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
+
+    return patches.Polygon(
+        geometry,
+        fill=fill,
+        linewidth=linewidth,
+        edgecolor=(*color, alpha),
+        facecolor=(*color, alpha),
+        label=label,
+    )
+
+
+def create_obj_patch(
+    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
+    page_dimensions: Tuple[int, int],
+    **kwargs: Any,
+) -> patches.Patch:
+    """Create a matplotlib patch for the element
+
+    Args:
+    ----
+        geometry: bounding box (straight or rotated) of the element
+        page_dimensions: dimensions of the page in format (height, width)
+        **kwargs: keyword arguments for the patch
+
+    Returns:
+    -------
+        a matplotlib Patch
+    """
+    if isinstance(geometry, tuple):
+        if len(geometry) == 2:  # straight word BB (2 pts)
+            return rect_patch(geometry, page_dimensions, **kwargs)
+        elif len(geometry) == 4:  # rotated word BB (4 pts)
+            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
+    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
+        return polygon_patch(geometry, page_dimensions, **kwargs)
+    raise ValueError("invalid geometry format")
+
+
+def get_colors(num_colors: int) -> List[Tuple[float, float, float]]:
+    """Generate num_colors color for matplotlib
+
+    Args:
+    ----
+        num_colors: number of colors to generate
+
+    Returns:
+    -------
+        colors: list of generated colors
+    """
+    colors = []
+    for i in np.arange(0.0, 360.0, 360.0 / num_colors):
+        hue = i / 360.0
+        lightness = (50 + np.random.rand() * 10) / 100.0
+        saturation = (90 + np.random.rand() * 10) / 100.0
+        colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+    return colors
 
 
 
-[docs] +[docs] def visualize_page( page: Dict[str, Any], image: np.ndarray, @@ -359,18 +472,18 @@

Source code for doctr.utils.visualization

 ) -> Figure:
     """Visualize a full page with predicted blocks, lines and words
 
-    Example::
-        >>> import numpy as np
-        >>> import matplotlib.pyplot as plt
-        >>> from doctr.utils.visualization import visualize_page
-        >>> from doctr.models import ocr_db_crnn
-        >>> model = ocr_db_crnn(pretrained=True)
-        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-        >>> out = model([[input_page]])
-        >>> visualize_page(out[0].pages[0].export(), input_page)
-        >>> plt.show()
+    >>> import numpy as np
+    >>> import matplotlib.pyplot as plt
+    >>> from doctr.utils.visualization import visualize_page
+    >>> from doctr.models import ocr_db_crnn
+    >>> model = ocr_db_crnn(pretrained=True)
+    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+    >>> out = model([[input_page]])
+    >>> visualize_page(out[0].pages[0].export(), input_page)
+    >>> plt.show()
 
     Args:
+    ----
         page: the exported Page of a Document
         image: np array of the page, needs to have the same shape as page['dimensions']
         words_only: whether only words should be displayed
@@ -378,6 +491,11 @@ 

Source code for doctr.utils.visualization

         scale: figsize of the largest windows side
         interactive: whether the plot should be interactive
         add_labels: for static plot, adds text labels on top of bounding box
+        **kwargs: keyword arguments for the polygon patch
+
+    Returns:
+    -------
+        the matplotlib figure
     """
     # Get proper scale and aspect ratio
     h, w = image.shape[:2]
@@ -386,128 +504,189 @@ 

Source code for doctr.utils.visualization

     # Display the image
     ax.imshow(image)
     # hide both axis
-    ax.axis('off')
+    ax.axis("off")
 
     if interactive:
         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
 
-    for block in page['blocks']:
+    for block in page["blocks"]:
         if not words_only:
-            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
+            rect = create_obj_patch(
+                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
+            )
             # add patch on figure
             ax.add_patch(rect)
             if interactive:
                 # add patch to cursor's artists
                 artists.append(rect)
 
-        for line in block['lines']:
+        for line in block["lines"]:
             if not words_only:
-                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
+                rect = create_obj_patch(
+                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
-            for word in line['words']:
-                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
-                                         page['dimensions'], (0, 0, 1), **kwargs)
+            for word in line["words"]:
+                rect = create_obj_patch(
+                    word["geometry"],
+                    page["dimensions"],
+                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
+                    color=(0, 0, 1),
+                    **kwargs,
+                )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
                 elif add_labels:
-                    if len(word['geometry']) == 5:
+                    if len(word["geometry"]) == 5:
                         text_loc = (
-                            int(page['dimensions'][1] * (word['geometry'][0] - word['geometry'][2] / 2)),
-                            int(page['dimensions'][0] * (word['geometry'][1] - word['geometry'][3] / 2))
+                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
+                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
                         )
                     else:
                         text_loc = (
-                            int(page['dimensions'][1] * word['geometry'][0][0]),
-                            int(page['dimensions'][0] * word['geometry'][0][1])
+                            int(page["dimensions"][1] * word["geometry"][0][0]),
+                            int(page["dimensions"][0] * word["geometry"][0][1]),
+                        )
+
+                    if len(word["geometry"]) == 2:
+                        # We draw only if boxes are in straight format
+                        ax.text(
+                            *text_loc,
+                            word["value"],
+                            size=10,
+                            alpha=0.5,
+                            color=(0, 0, 1),
                         )
-                    ax.text(
-                        *text_loc,
-                        word['value'],
-                        size=10,
-                        alpha=0.5,
-                        color=(0, 0, 1),
-                    )
 
         if display_artefacts:
-            for artefact in block['artefacts']:
-                rect = create_rect_patch(
-                    artefact['geometry'],
-                    'artefact',
-                    page['dimensions'],
-                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
+            for artefact in block["artefacts"]:
+                rect = create_obj_patch(
+                    artefact["geometry"],
+                    page["dimensions"],
+                    label="artefact",
+                    color=(0.5, 0.5, 0.5),
                     linewidth=1,
-                    **kwargs
+                    **kwargs,
                 )
                 ax.add_patch(rect)
                 if interactive:
                     artists.append(rect)
 
     if interactive:
+        import mplcursors
+
         # Create mlp Cursor to hover patches in artists
         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
-    fig.tight_layout(pad=0.)
+    fig.tight_layout(pad=0.0)
 
     return fig
-def synthetize_page( +def visualize_kie_page( page: Dict[str, Any], - draw_proba: bool = False, - font_size: int = 13, -) -> np.ndarray: - """Draw a the content of the element page (OCR response) on a blank page. + image: np.ndarray, + words_only: bool = False, + display_artefacts: bool = True, + scale: float = 10, + interactive: bool = True, + add_labels: bool = True, + **kwargs: Any, +) -> Figure: + """Visualize a full page with predicted blocks, lines and words + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from doctr.utils.visualization import visualize_page + >>> from doctr.models import ocr_db_crnn + >>> model = ocr_db_crnn(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([[input_page]]) + >>> visualize_kie_page(out[0].pages[0].export(), input_page) + >>> plt.show() Args: - page: exported Page object to represent - draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0 - font_size: size of the font, default font = 13 + ---- + page: the exported Page of a Document + image: np array of the page, needs to have the same shape than page['dimensions'] + words_only: whether only words should be displayed + display_artefacts: whether artefacts should be displayed + scale: figsize of the largest windows side + interactive: whether the plot should be interactive + add_labels: for static plot, adds text labels on top of bounding box + **kwargs: keyword arguments for the polygon patch - Return: - A np array (drawn page) + Returns: + ------- + the matplotlib figure """ - # Draw template - h, w = page["dimensions"] - response = 255 * np.ones((h, w, 3), dtype=np.int32) + # Get proper scale and aspect ratio + h, w = image.shape[:2] + size = (scale * w / h, scale) if h > w else (scale, h / w * scale) + fig, ax = plt.subplots(figsize=size) + # Display the image + ax.imshow(image) + # hide both axis + ax.axis("off") - # Draw each word - for block in page["blocks"]: - for line in block["lines"]: - for word in line["words"]: - # Get aboslute word geometry - (xmin, ymin), (xmax, ymax) = word["geometry"] - xmin, xmax = int(w * xmin), int(w * xmax) - ymin, ymax = int(h * ymin), int(h * ymax) - - # White drawing context adapted to font size, 0.75 factor to convert pts --> pix - h_box, w_box = ymax - ymin, xmax - xmin - h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75)) - img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255)) - d = ImageDraw.Draw(img) - - # Draw in black the value of the word - d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0)) - - # Resize back to box size - img = img.resize((w_box, h_box), Image.NEAREST) - - # Colorize if draw_proba - if draw_proba: - p = int(255 * word["confidence"]) - mask = np.where(np.array(img) == 0, 1, 0) - proba = np.array([255 - p, 0, p]) - color = mask * proba[np.newaxis, np.newaxis, :] - white_mask = 255 * (1 - mask) - img = color + white_mask - - # Write to response page - response[ymin:ymax, xmin:xmax, :] = np.array(img) - - return response + if interactive: + artists: List[patches.Patch] = [] # instantiate an empty list of patches (to be drawn on the page) + + colors = {k: color for color, k in zip(get_colors(len(page["predictions"])), page["predictions"])} + for key, value in page["predictions"].items(): + for prediction in value: + if not words_only: + rect = create_obj_patch( + prediction["geometry"], + page["dimensions"], + label=f"{key} \n {prediction['value']} (confidence: 
{prediction['confidence']:.2%}", + color=colors[key], + linewidth=1, + **kwargs, + ) + # add patch on figure + ax.add_patch(rect) + if interactive: + # add patch to cursor's artists + artists.append(rect) + + if interactive: + import mplcursors + + # Create mlp Cursor to hover patches in artists + mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label())) + fig.tight_layout(pad=0.0) + + return fig + + +def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None: + """Draw an array of relative straight boxes on an image + + Args: + ---- + boxes: array of relative boxes, of shape (*, 4) + image: np array, float32 or uint8 + color: color to use for bounding box edges + **kwargs: keyword arguments from `matplotlib.pyplot.plot` + """ + h, w = image.shape[:2] + # Convert boxes to absolute coords + _boxes = deepcopy(boxes) + _boxes[:, [0, 2]] *= w + _boxes[:, [1, 3]] *= h + _boxes = _boxes.astype(np.int32) + for box in _boxes.tolist(): + xmin, ymin, xmax, ymax = box + image = cv2.rectangle( + image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2 + ) + plt.imshow(image) + plt.plot(**kwargs)
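Illustrative usage sketch for `draw_boxes` above: relative (xmin, ymin, xmax, ymax) boxes drawn on a dummy image (displaying the resulting figure is left to the caller).

import numpy as np
from doctr.utils.visualization import draw_boxes

image = 255 * np.ones((200, 300, 3), dtype=np.uint8)
boxes = np.array([[0.1, 0.2, 0.4, 0.5], [0.5, 0.6, 0.9, 0.8]], dtype=np.float32)
draw_boxes(boxes, image, color=(255, 0, 0))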
@@ -540,8 +719,8 @@

Source code for doctr.utils.visualization

       
     
   
- - + + diff --git a/v0.6.0/_modules/index.html b/v0.6.0/_modules/index.html index e86abcd4d4..5793c44f20 100644 --- a/v0.6.0/_modules/index.html +++ b/v0.6.0/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -225,20 +225,42 @@ - - + + diff --git a/v0.6.0/_sources/changelog.rst.txt b/v0.6.0/_sources/changelog.rst.txt index 430097d6c8..35befe7b96 100644 --- a/v0.6.0/_sources/changelog.rst.txt +++ b/v0.6.0/_sources/changelog.rst.txt @@ -1,6 +1,54 @@ Changelog ========= +v0.10.0 (2024-10-21) +------------------- +Release note: `v0.10.0 `_ + +v0.9.0 (2024-08-08) +------------------- +Release note: `v0.9.0 `_ + +v0.8.1 (2024-03-04) +------------------- +Release note: `v0.8.1 `_ + +v0.8.0 (2024-02-28) +------------------- +Release note: `v0.8.0 `_ + +v0.7.0 (2023-09-09) +------------------- +Release note: `v0.7.0 `_ + +v0.6.0 (2022-09-29) +------------------- +Release note: `v0.6.0 `_ + +v0.5.1 (2022-03-22) +------------------- +Release note: `v0.5.1 `_ + +v0.5.0 (2021-12-31) +------------------- +Release note: `v0.5.0 `_ + +v0.4.1 (2021-11-22) +------------------- +Release note: `v0.4.1 `_ + +v0.4.0 (2021-10-01) +------------------- +Release note: `v0.4.0 `_ + +v0.3.1 (2021-08-27) +------------------- +Release note: `v0.3.1 `_ + +v0.3.0 (2021-07-02) +------------------- +Release note: `v0.3.0 `_ + v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.6.0/_sources/datasets.rst.txt b/v0.6.0/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.6.0/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.6.0/_sources/documents.rst.txt b/v0.6.0/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.6.0/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. 
currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.6.0/_sources/getting_started/installing.rst.txt b/v0.6.0/_sources/getting_started/installing.rst.txt index e764e734a7..39e79aa3dd 100644 --- a/v0.6.0/_sources/getting_started/installing.rst.txt +++ b/v0.6.0/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.9 or higher. +This library requires `Python `_ 3.10 or higher. Prerequisites diff --git a/v0.6.0/_sources/index.rst.txt b/v0.6.0/_sources/index.rst.txt index fc3ff89fdf..53251db142 100644 --- a/v0.6.0/_sources/index.rst.txt +++ b/v0.6.0/_sources/index.rst.txt @@ -1,7 +1,8 @@ -DocTR: Document Text Recognition -================================ +******************************** +docTR: Document Text Recognition +******************************** -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -9,38 +10,29 @@ State-of-the-art Optical Character Recognition made seamless & accessible to any DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. +* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. -Welcome to the documentation of `DocTR `_! 
- - Main Features ------------- * |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract +* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, small dependencies -* |:tools:| Daily maintained -* |:factory:| Easy integration - +* |:bird:| Light package, minimal dependencies +* |:tools:| Actively maintained by Mindee +* |:factory:| Easy integration (available templates for browser demo & API deployment) -Getting Started ---------------- .. toctree:: :maxdepth: 2 + :caption: Getting started + :hidden: - installing - - -Build & train your predictor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) -* Fine-tune or train from scratch any detection or recognition model to specialize on your data + getting_started/installing + notebooks Model zoo @@ -48,36 +40,83 @@ Model zoo Text detection models """"""""""""""""""""" - * `DBNet `_ (Differentiable Binarization) - * `LinkNet `_ +* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ +* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ +* FAST from `"FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation" `_ Text recognition models """"""""""""""""""""""" - * `SAR `_ (Show, Attend and Read) - * `CRNN `_ (Convolutional Recurrent Neural Network) - * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) +* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ +* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ +* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ +* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ +* PARSeq from `"Scene Text Recognition with Permuted Autoregressive Sequence Models" `_ Supported datasets ^^^^^^^^^^^^^^^^^^ - * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. - * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. - * SROIE from `ICDAR 2019 `_. +* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. +* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. +* SROIE from `ICDAR 2019 `_. +* IIIT-5k from `CVIT `_. +* Street View Text from `"End-to-End Scene Text Recognition" `_. +* SynthText from `Visual Geometry Group `_. +* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. +* IC03 from `ICDAR 2003 `_. +* IC13 from `ICDAR 2013 `_. +* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. +* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. +* IIITHWS from `"Generating Synthetic Data for Text Recognition" `_. +* WILDRECEIPT from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction" `_. .. 
toctree:: :maxdepth: 2 - :caption: Notes + :caption: Using docTR + :hidden: - changelog + using_doctr/using_models + using_doctr/using_datasets + using_doctr/using_contrib_modules + using_doctr/sharing_models + using_doctr/using_model_export + using_doctr/custom_models_training + using_doctr/running_on_aws + + +.. toctree:: + :maxdepth: 2 + :caption: Community + :hidden: + + community/resources .. toctree:: :maxdepth: 2 :caption: Package Reference + :hidden: - datasets - documents - models - transforms - utils + modules/contrib + modules/datasets + modules/io + modules/models + modules/transforms + modules/utils + + +.. toctree:: + :maxdepth: 2 + :caption: Contributing + :hidden: + + contributing/code_of_conduct + contributing/contributing + + +.. toctree:: + :maxdepth: 2 + :caption: Notes + :hidden: + + changelog diff --git a/v0.6.0/_sources/installing.rst.txt b/v0.6.0/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.6.0/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.6.0/_sources/models.rst.txt b/v0.6.0/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.6.0/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. 
- - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. 
-We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.6.0/_sources/transforms.rst.txt b/v0.6.0/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.6.0/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. 
autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.6.0/_sources/utils.rst.txt b/v0.6.0/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.6.0/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.6.0/_static/basic.css b/v0.6.0/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.6.0/_static/basic.css +++ b/v0.6.0/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.6.0/_static/doctools.js b/v0.6.0/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.6.0/_static/doctools.js +++ b/v0.6.0/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.6.0/_static/documentation_options.js b/v0.6.0/_static/documentation_options.js index a7b5cbe04a..4f656fdbea 100644 --- a/v0.6.0/_static/documentation_options.js +++ b/v0.6.0/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.3.0a0-git', + VERSION: '0.10.1a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.6.0/_static/language_data.js b/v0.6.0/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.6.0/_static/language_data.js +++ b/v0.6.0/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.6.0/_static/searchtools.js b/v0.6.0/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.6.0/_static/searchtools.js +++ b/v0.6.0/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. 
const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.6.0/changelog.html b/v0.6.0/changelog.html index eafac3a877..fc45a50384 100644 --- a/v0.6.0/changelog.html +++ b/v0.6.0/changelog.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + Changelog - docTR documentation @@ -226,20 +226,42 @@ + diff --git a/v0.6.0/community/resources.html b/v0.6.0/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.6.0/community/resources.html +++ b/v0.6.0/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.6.0/contributing/code_of_conduct.html b/v0.6.0/contributing/code_of_conduct.html index 5ea4a1f99d..03422dbb4d 100644 --- a/v0.6.0/contributing/code_of_conduct.html +++ b/v0.6.0/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -504,7 +504,7 @@

Attribution - + diff --git a/v0.6.0/contributing/contributing.html b/v0.6.0/contributing/contributing.html index e5a85682c6..05e2b3641b 100644 --- a/v0.6.0/contributing/contributing.html +++ b/v0.6.0/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -481,7 +481,7 @@

Let’s connect - + diff --git a/v0.6.0/datasets.html b/v0.6.0/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.6.0/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework -can save a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before being passed to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets -of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name

size

characters

digits

10

0123456789

ascii_letters

52

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

punctuation

32

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

currency

5

£€¥¢฿

latin

96

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

french

154

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as a mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
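A minimal usage sketch of encode_sequences (hedged: it assumes the vocab strings listed above are exposed through doctr.datasets.VOCABS, and relies only on the signature documented here):
>>> from doctr.datasets import encode_sequences, VOCABS
>>> encoded = encode_sequences(sequences=["hello", "world"], vocab=VOCABS["french"], target_size=10)
>>> # encoded is an integer array of shape (2, 10): one row of character indices per sequence, padded up to target_size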
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/documents.html b/v0.6.0/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.6.0/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, words at the same height but in different columns are considered to belong to two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and the confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
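As a hedged illustration of the structure above (assuming doc is a Document instance, e.g. the output of an OCR predictor from doctr.models), the hierarchy is typically traversed as follows:
>>> for page in doc.pages:
>>>     for block in page.blocks:
>>>         for line in block.lines:
>>>             for word in line.words:
>>>                 print(word.value, word.confidence)  # text content and prediction confidence of each word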
-

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into images in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF file returned as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert them into images in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/genindex.html b/v0.6.0/genindex.html index a19b433943..21520455b4 100644 --- a/v0.6.0/genindex.html +++ b/v0.6.0/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -224,20 +224,42 @@

+
+

U

+ + +
+
+

V

@@ -561,7 +711,13 @@

V

W

+
@@ -599,8 +755,8 @@

W

- - + + diff --git a/v0.6.0/getting_started/installing.html b/v0.6.0/getting_started/installing.html index a488e9a030..af3b58193e 100644 --- a/v0.6.0/getting_started/installing.html +++ b/v0.6.0/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -305,7 +305,7 @@

Installation

-

This library requires Python 3.9 or higher.

+

This library requires Python 3.10 or higher.

Prerequisites

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

@@ -435,7 +435,7 @@

Via Git - + diff --git a/v0.6.0/index.html b/v0.6.0/index.html index 4c6a28c66a..3a06afc6d9 100644 --- a/v0.6.0/index.html +++ b/v0.6.0/index.html @@ -12,9 +12,9 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + - + docTR documentation @@ -226,20 +226,42 @@
-

DocTR: Document Text Recognition

-

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta)

+

docTR: Document Text Recognition

+

State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

DocTR provides an easy and powerful way to extract valuable information from your documents:

    -
  • 🧾 for automation: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • +
  • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

  • 👩‍🔬 for research: quickly compare your own architectures speed & performances with state-of-art models on public datasets.

-

Welcome to the documentation of DocTR!

Main Features

  • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

  • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor

  • -
  • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

  • +
  • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

  • ⚡ Optimized for inference speed on both CPU & GPU

  • -
  • 🐦 Light package, small dependencies

  • -
  • 🛠️ Daily maintained

  • -
  • 🏭 Easy integration

  • +
  • 🐦 Light package, minimal dependencies

  • +
  • 🛠️ Actively maintained by Mindee

  • +
  • 🏭 Easy integration (available templates for browser demo & API deployment)

-
-
-

Getting Started

-
-

Build & train your predictor

-
    -
  • Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

  • -
  • Fine-tune or train from scratch any detection or recognition model to specialize on your data

  • -
-

Model zoo

Text detection models

-
-

Text recognition models

-
-

Supported datasets

-
-
+
+
+
+
+
@@ -406,7 +381,7 @@

Supported datasets - +
Next @@ -446,10 +421,8 @@

Supported datasets + diff --git a/v0.6.0/installing.html b/v0.6.0/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.6.0/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running an OS other than Linux, you will need a few extra dependencies.

-

For MacOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the latest stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/models.html b/v0.6.0/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.6.0/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with a TensorFlow backend, along with its specific post-processor to make outputs structured and reusable.

  • -
-
-
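As a hedged end-to-end illustration of this design (relying on ocr_predictor, documented further down this page, which roughly wires a PreProcessor, the detection and recognition models and their post-processors together):
>>> import numpy as np
>>> from doctr.models import ocr_predictor
>>> predictor = ocr_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> result = predictor([input_page])  # pre-processing, forward pass and post-processing in a single call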

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Input shape

# params

Recall

Precision

Recall

Precision

FPS

db_resnet50

(1024, 1024, 3)

25.2 M

82.14

87.64

92.49

89.66

2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model, we feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used an AWS c5.12xlarge instance (CPU Xeon Platinum 8275L) to perform the experiments.

-
-
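A hedged sketch of this benchmarking protocol (it assumes model is a detection model instantiated as in the examples below; absolute figures obviously depend on the hardware):
>>> import time
>>> import tensorflow as tf
>>> for _ in range(100):  # warm-up: 100 forward passes on random tensors
>>>     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32), training=False)
>>> start = time.time()
>>> for _ in range(1000):  # timed run: 1000 batches of a single frame
>>>     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32), training=False)
>>> fps = 1000 / (time.time() - start)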

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
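A minimal, hedged sketch of these three steps (the normalization statistics below are placeholders; the actual predictor uses the statistics of its training data):
>>> import tensorflow as tf
>>> imgs = [tf.random.uniform(shape=[900, 600, 3]), tf.random.uniform(shape=[1100, 800, 3])]
>>> resized = [tf.image.resize(img, [1024, 1024], method="bilinear") for img in imgs]  # 1. resize, with potential deformation
>>> batch = tf.stack(resized, axis=0)  # 2. batch images together
>>> mean, std = tf.constant([0.5, 0.5, 0.5]), tf.constant([0.5, 0.5, 0.5])  # placeholder statistics
>>> batch = (batch - mean) / std  # 3. normalize the batch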
-

Detection models

-

Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Text recognition model zoo

Architecture

Input shape

# params

FUNSD

CORD

FPS

crnn_vgg16_bn

(32, 128, 3)

15.8M

86.02

91.3

12.8

sar_vgg16_bn

(32, 128, 3)

21.5M

86.2

91.7

3.3

sar_resnet31

(32, 128, 3)

53.1M

86.3

92.1

2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model's capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model, we feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used an AWS c5.12xlarge instance (CPU Xeon Platinum 8275L) to perform the experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
-
-
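A minimal, hedged sketch of steps 1 and 2 for a (32, 128) target size (batching and normalization then follow as in the detection case):
>>> import tensorflow as tf
>>> img = tf.random.uniform(shape=[64, 200, 3])
>>> img = tf.image.resize(img, [32, 128], method="bilinear", preserve_aspect_ratio=True)  # 1. resize without deformation
>>> img = tf.image.pad_to_bounding_box(img, 0, 0, 32, 128)  # 2. zero-pad the resized image to the target size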

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
-

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

Example::
>>> import tensorflow as tf
>>> from doctr.models import crnn_vgg16_bn
>>> model = crnn_vgg16_bn(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture

doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) → SAR[source]

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example::
>>> import tensorflow as tf
>>> from doctr.models import sar_vgg16_bn
>>> model = sar_vgg16_bn(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture

doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) → SAR[source]

SAR with a resnet-31 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example::
>>> import tensorflow as tf
>>> from doctr.models import sar_resnet31
>>> model = sar_resnet31(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture

doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) → MASTER[source]

MASTER as described in the paper: https://arxiv.org/pdf/1910.02562.pdf.

Example::
>>> import tensorflow as tf
>>> from doctr.models import master
>>> model = master(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:

text recognition architecture


Recognition predictors


Combining the right components around a given architecture for easier usage.

doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → RecognitionPredictor[source]

Text recognition architecture.

Example::
>>> import numpy as np
>>> from doctr.models import recognition_predictor
>>> model = recognition_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
>>> out = model([input_page])

Parameters:

  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • pretrained – If True, returns a model pre-trained on our text recognition dataset

Returns:

Recognition predictor


End-to-End OCR


Predictors that localize and identify text elements in images

Architecture                | FUNSD Recall | FUNSD Precision | FUNSD FPS | CORD Recall | CORD Precision | CORD FPS
db_resnet50 + crnn_vgg16_bn | 70.08        | 74.77           | 0.85      | 82.19       | 79.67          | 1.6
db_resnet50 + sar_vgg16_bn  | N/A          | N/A             | 0.49      | N/A         | N/A            | 1.0
db_resnet50 + sar_resnet31  | N/A          | N/A             | 0.27      | N/A         | N/A            | 0.83
Gvision text detection      | 59.50        | 62.50           |           | 75.30       | 70.00          |
Gvision doc. text detection | 64.00        | 53.30           |           | 68.90       | 61.10          |
AWS textract                | 78.10        | 83.00           |           | 87.50       | 66.00          |

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

All the recognition models used in these predictors are trained with our french vocab (cf. Supported Vocabs).

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities.

FPS (Frames per second) is computed this way: we instantiate the predictor, warm up the model, and then measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. We used a c5.x12large AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

Results on private OCR datasets

Architecture                       | Receipts Recall | Receipts Precision | Invoices Recall | Invoices Precision | IDs Recall | IDs Precision
db_resnet50 + crnn_vgg16_bn (ours) | 78.90           | 81.01              | 65.68           | 69.86              | 49.48      | 50.46
Gvision doc. text detection        | 68.91           | 59.89              | 63.20           | 52.85              | 43.70      | 29.21
AWS textract                       | 75.77           | 77.70              | 70.47           | 69.13              | 46.39      | 43.32

Two-stage approaches

Those architectures involve one stage of text detection and one stage of text recognition: the detection stage produces cropped images that are then passed to the text recognition block, as sketched below.
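
For illustration, the two stages can also be chained by hand with the predictors documented above. This is only a rough sketch: it assumes the detection predictor returns, for each page, an (N, 5) array of relative (xmin, ymin, xmax, ymax, score) boxes, and it skips the extra processing (cropping strategy, batching, result aggregation) that the ocr_predictor below handles for you:

>>> import numpy as np
>>> from doctr.models import detection_predictor, recognition_predictor
>>> det_model = detection_predictor(pretrained=True)
>>> reco_model = recognition_predictor(pretrained=True)
>>> page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> boxes = det_model([page])[0]  # assumed format: (N, 5) relative boxes
>>> h, w = page.shape[:2]
>>> crops = [page[int(ymin * h):int(ymax * h), int(xmin * w):int(xmax * w)] for xmin, ymin, xmax, ymax, _ in boxes]
>>> words = reco_model(crops)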

doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → OCRPredictor[source]

End-to-end OCR architecture using one model for localization, and another for text recognition.

Example::
>>> import numpy as np
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> out = model([input_page])

Parameters:

  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

  • pretrained – If True, returns a model pre-trained on our OCR dataset

Returns:

OCR predictor


Model export


Utility functions to make the most of document analysis models.


Model compression

doctr.models.export.convert_to_tflite(tf_model: Model) → bytes[source]

Converts a model to TFLite format

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_tflite, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_tflite(model)

Parameters:

tf_model – a keras model

Returns:

the serialized TFLite model

Return type:

bytes

doctr.models.export.convert_to_fp16(tf_model: Model) → bytes[source]

Converts a model to half precision

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_fp16, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_fp16(model)

Parameters:

tf_model – a keras model

Returns:

the serialized FP16 model

Return type:

bytes

doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) → bytes[source]

Quantize a TensorFlow model

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import quantize_model, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = quantize_model(model, (224, 224, 3))

Parameters:

  • tf_model – a keras model

  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

Returns:

the serialized quantized model

Return type:

bytes
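
Since all three helpers above return raw bytes, persisting the compressed model is a plain file write. A minimal sketch, reusing the toy model from the examples above (the output filename is arbitrary):

>>> from tensorflow.keras import Sequential
>>> from doctr.models import quantize_model, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = quantize_model(model, (224, 224, 3))
>>> with open('model_quantized.tflite', 'wb') as f:  # arbitrary output path
...     f.write(serialized_model)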


Using SavedModel

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> _ = model(input_t, training=False)
>>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')

And loaded just as easily:

>>> import tensorflow as tf
>>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
your own model": [[13, null]], "Two-stage approaches": [[19, "two-stage-approaches"]], "Unit tests": [[3, "unit-tests"]], "Use your own datasets": [[17, "use-your-own-datasets"]], "Using your ONNX exported model": [[18, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[4, "via-conda-only-for-linux"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[11, "visualization"]], "What should I do with the output?": [[19, "what-should-i-do-with-the-output"]], "Word": [[8, "word"]], "docTR Notebooks": [[12, null]], "docTR Vocabs": [[7, "id62"]], "docTR: Document Text Recognition": [[5, null]], "doctr.contrib": [[6, null]], "doctr.datasets": [[7, null], [7, "datasets"]], "doctr.io": [[8, null]], "doctr.models": [[9, null]], "doctr.models.classification": [[9, "doctr-models-classification"]], "doctr.models.detection": [[9, "doctr-models-detection"]], "doctr.models.factory": [[9, "doctr-models-factory"]], "doctr.models.recognition": [[9, "doctr-models-recognition"]], "doctr.models.zoo": [[9, "doctr-models-zoo"]], "doctr.transforms": [[10, null]], "doctr.utils": [[11, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.10.0 (2024-10-21)": [[0, "v0-10-0-2024-10-21"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]], "v0.9.0 (2024-08-08)": [[0, "v0-9-0-2024-08-08"]]}, "docnames": ["changelog", "community/resources", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "community/resources.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[8, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[8, "doctr.io.Block", false]], 
"channelshuffle (class in doctr.transforms)": [[10, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[7, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[10, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[10, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[7, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[7, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[9, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[7, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[7, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[8, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[8, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[7, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[9, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[8, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[7, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[10, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[10, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[7, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[7, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[7, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[7, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[7, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[9, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[10, "doctr.transforms.LambdaTransformation", false]], 
"line (class in doctr.io)": [[8, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[9, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[7, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[10, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[9, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[7, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[10, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[8, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[9, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[9, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[10, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[10, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[10, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[10, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[10, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[10, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[10, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[10, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[10, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[10, "doctr.transforms.RandomRotate", false]], "randomsaturation 
(class in doctr.transforms)": [[10, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[10, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[8, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[8, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[8, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[8, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[7, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[10, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[9, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[8, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[8, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[7, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[7, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[7, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[7, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[11, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[9, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[10, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[11, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[11, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[11, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[11, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[9, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[11, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_b", 
false]], "vit_s() (in module doctr.models.classification)": [[9, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[9, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[7, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[8, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[7, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[7, 0, 1, "", "CORD"], [7, 0, 1, "", "CharacterGenerator"], [7, 0, 1, "", "DetectionDataset"], [7, 0, 1, "", "DocArtefacts"], [7, 0, 1, "", "FUNSD"], [7, 0, 1, "", "IC03"], [7, 0, 1, "", "IC13"], [7, 0, 1, "", "IIIT5K"], [7, 0, 1, "", "IIITHWS"], [7, 0, 1, "", "IMGUR5K"], [7, 0, 1, "", "MJSynth"], [7, 0, 1, "", "OCRDataset"], [7, 0, 1, "", "RecognitionDataset"], [7, 0, 1, "", "SROIE"], [7, 0, 1, "", "SVHN"], [7, 0, 1, "", "SVT"], [7, 0, 1, "", "SynthText"], [7, 0, 1, "", "WILDRECEIPT"], [7, 0, 1, "", "WordGenerator"], [7, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[7, 0, 1, "", "DataLoader"]], "doctr.io": [[8, 0, 1, "", "Artefact"], [8, 0, 1, "", "Block"], [8, 0, 1, "", "Document"], [8, 0, 1, "", "DocumentFile"], [8, 0, 1, "", "Line"], [8, 0, 1, "", "Page"], [8, 0, 1, "", "Word"], [8, 1, 1, "", "decode_img_as_tensor"], [8, 1, 1, "", "read_html"], [8, 1, 1, "", "read_img_as_numpy"], [8, 1, 1, "", "read_img_as_tensor"], [8, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[8, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[8, 2, 1, "", "from_images"], [8, 2, 1, "", "from_pdf"], [8, 2, 1, "", "from_url"]], "doctr.io.Page": [[8, 2, 1, "", "show"]], "doctr.models": [[9, 1, 1, "", "kie_predictor"], [9, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[9, 1, 1, "", "crop_orientation_predictor"], [9, 1, 1, "", "magc_resnet31"], [9, 1, 1, "", "mobilenet_v3_large"], [9, 1, 1, "", "mobilenet_v3_large_r"], [9, 1, 1, "", "mobilenet_v3_small"], [9, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [9, 1, 1, "", "mobilenet_v3_small_page_orientation"], [9, 1, 1, "", "mobilenet_v3_small_r"], [9, 1, 1, "", "page_orientation_predictor"], [9, 1, 1, "", "resnet18"], [9, 1, 1, "", "resnet31"], [9, 1, 1, "", "resnet34"], [9, 1, 1, "", "resnet50"], [9, 1, 1, "", "textnet_base"], [9, 1, 1, "", "textnet_small"], [9, 1, 1, "", "textnet_tiny"], [9, 1, 1, "", "vgg16_bn_r"], [9, 1, 1, "", "vit_b"], [9, 1, 1, "", "vit_s"]], "doctr.models.detection": [[9, 1, 1, "", "db_mobilenet_v3_large"], [9, 1, 1, "", "db_resnet50"], [9, 1, 1, "", "detection_predictor"], [9, 1, 1, "", "fast_base"], [9, 1, 1, "", "fast_small"], [9, 1, 1, "", "fast_tiny"], [9, 1, 1, "", "linknet_resnet18"], [9, 1, 1, "", "linknet_resnet34"], [9, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[9, 1, 1, "", "from_hub"], [9, 1, 1, "", "login_to_hub"], [9, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[9, 1, 1, "", "crnn_mobilenet_v3_large"], [9, 1, 1, "", "crnn_mobilenet_v3_small"], [9, 1, 1, "", "crnn_vgg16_bn"], [9, 1, 1, "", "master"], [9, 1, 1, "", "parseq"], [9, 1, 1, "", "recognition_predictor"], [9, 1, 1, "", "sar_resnet31"], [9, 1, 1, "", "vitstr_base"], [9, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[10, 0, 1, "", "ChannelShuffle"], [10, 0, 1, "", "ColorInversion"], [10, 0, 1, "", "Compose"], [10, 0, 1, "", "GaussianBlur"], [10, 0, 1, "", "GaussianNoise"], [10, 0, 1, "", "LambdaTransformation"], [10, 0, 1, "", 
"Normalize"], [10, 0, 1, "", "OneOf"], [10, 0, 1, "", "RandomApply"], [10, 0, 1, "", "RandomBrightness"], [10, 0, 1, "", "RandomContrast"], [10, 0, 1, "", "RandomCrop"], [10, 0, 1, "", "RandomGamma"], [10, 0, 1, "", "RandomHorizontalFlip"], [10, 0, 1, "", "RandomHue"], [10, 0, 1, "", "RandomJpegQuality"], [10, 0, 1, "", "RandomResize"], [10, 0, 1, "", "RandomRotate"], [10, 0, 1, "", "RandomSaturation"], [10, 0, 1, "", "RandomShadow"], [10, 0, 1, "", "Resize"], [10, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[11, 0, 1, "", "DetectionMetric"], [11, 0, 1, "", "LocalizationConfusion"], [11, 0, 1, "", "OCRMetric"], [11, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[11, 2, 1, "", "summary"], [11, 2, 1, "", "update"]], "doctr.utils.visualization": [[11, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 8, 9, 11, 15, 18], "0": [2, 4, 7, 10, 11, 13, 16, 17, 19], "00": 19, "01": 19, "0123456789": 7, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "02562": 9, "03": 19, "035": 19, "0361328125": 19, "04": 19, "05": 19, "06": 19, "06640625": 19, "07": 19, "08": [10, 19], "09": 19, "0966796875": 19, "1": [7, 8, 9, 10, 11, 13, 17, 19], "10": [4, 7, 11, 19], "100": [7, 10, 11, 17, 19], "1000": 19, "101": 7, "1024": [9, 13, 19], "104": 7, "106": 7, "108": 7, "1095": 17, "11": 19, "110": 11, "1107": 17, "114": 7, "115": 7, "1156": 17, "116": 7, "118": 7, "11800h": 19, "11th": 19, "12": 19, "120": 7, "123": 7, "126": 7, "1268": 17, "128": [9, 13, 18, 19], "13": 19, "130": 7, "13068": 17, "131": 7, "1337891": 17, "1357421875": 19, "1396484375": 19, "14": 19, "1420": 19, "14470v1": 7, "149": 17, "15": 19, "150": [11, 19], "1552": 19, "16": [9, 18, 19], "1630859375": 19, "1684": 19, "16x16": 9, "17": 19, "1778": 19, "1782": 19, "18": [9, 19], "185546875": 19, "1900": 19, "1910": 9, "19342": 17, "19370": 17, "195": 7, "19598": 17, "199": 19, "1999": 19, "2": [4, 5, 7, 8, 9, 10, 16, 19], "20": 19, "200": 11, "2000": 17, "2003": [5, 7], "2012": 7, "2013": [5, 7], "2015": 7, "2019": 5, "2023": 1, "207901": 17, "21": 19, "2103": 7, "2186": 17, "21888": 17, "22": 19, "224": [9, 10], "225": 10, "22672": 17, "229": [10, 17], "23": 19, "233": 17, "236": 7, "24": 19, "246": 17, "249": 17, "25": 19, "2504": 19, "255": [8, 9, 10, 11, 19], "256": 9, "257": 17, "26": 19, "26032": 17, "264": 13, "27": 19, "2700": 17, "2710": 19, "2749": 13, "28": 19, "287": 13, "29": 19, "296": 13, "299": 13, "2d": 19, "3": [4, 5, 8, 9, 10, 11, 18, 19], "30": 19, "300": 17, "3000": 17, "301": 13, "30595": 19, "30ghz": 19, "31": 9, "32": [7, 9, 10, 13, 17, 18, 19], "3232421875": 19, "33": [10, 19], "33402": 17, "33608": 17, "34": [9, 19], "340": 19, "3456": 19, "3515625": 19, "36": 19, "360": 17, "37": [7, 19], "38": 19, "39": 19, "4": [9, 10, 11, 19], "40": 19, "406": 10, "41": 19, "42": 19, "43": 19, "44": 19, "45": 19, "456": 10, "46": 19, "47": 19, "472": 17, "48": [7, 19], "485": 10, "49": 19, "49377": 17, "5": [7, 10, 11, 16, 19], "50": [9, 17, 19], 
"51": 19, "51171875": 19, "512": 9, "52": [7, 19], "529": 19, "53": 19, "54": 19, "540": 19, "5478515625": 19, "55": 19, "56": 19, "57": 19, "58": [7, 19], "580": 19, "5810546875": 19, "583": 19, "59": 19, "597": 19, "5k": [5, 7], "5m": 19, "6": [10, 19], "60": 10, "600": [9, 11, 19], "61": 19, "62": 19, "626": 17, "63": 19, "64": [9, 10, 19], "641": 19, "647": 17, "65": 19, "66": 19, "67": 19, "68": 19, "69": 19, "693": 13, "694": 13, "695": 13, "6m": 19, "7": 19, "70": [7, 11, 19], "707470": 17, "71": [7, 19], "7100000": 17, "7141797": 17, "7149": 17, "72": 19, "72dpi": 8, "73": 19, "73257": 17, "74": 19, "75": [10, 19], "7581382": 17, "76": 19, "77": 19, "772": 13, "772875": 17, "78": 19, "785": 13, "79": 19, "793533": 17, "796": 17, "798": 13, "7m": 19, "8": [9, 10, 19], "80": 19, "800": [9, 11, 17, 19], "81": 19, "82": 19, "83": 19, "84": 19, "849": 17, "85": 19, "8564453125": 19, "857": 19, "85875": 17, "86": 19, "8603515625": 19, "87": 19, "8707": 17, "88": 19, "89": 19, "9": [10, 19], "90": 19, "90k": 7, "90kdict32px": 7, "91": 19, "914085328578949": 19, "92": 19, "93": 19, "94": [7, 19], "95": [11, 19], "9578408598899841": 19, "96": 19, "97": 19, "98": 19, "99": 19, "9949972033500671": 19, "A": [2, 3, 5, 7, 8, 9, 12, 18], "As": 3, "Be": 19, "Being": 2, "By": 14, "For": [2, 3, 4, 13, 19], "If": [3, 8, 9, 13, 19], "In": [3, 7, 17], "It": [10, 15, 16, 18], "Its": [5, 9], "No": [2, 19], "Of": 7, "Or": [16, 18], "The": [2, 3, 7, 8, 11, 14, 16, 17, 18, 19], "Then": 9, "To": [3, 4, 14, 15, 16, 18, 19], "_": [2, 7, 9], "__call__": 19, "_build": 3, "_i": 11, "ab": 7, "abc": 18, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 7, "abdef": [7, 17], "abl": [17, 19], "about": [2, 17, 19], "abov": 19, "abstract": 1, "abstractdataset": 7, "abus": 2, "accept": 2, "access": [5, 8, 17, 19], "account": [2, 15], "accur": 19, "accuraci": 11, "achiev": 18, "act": 2, "action": 2, "activ": 5, "ad": [3, 9, 10], "adapt": 2, "add": [10, 11, 15, 19], "add_hook": 19, "add_label": 11, "addit": [3, 4, 8, 16, 19], "addition": [3, 19], "address": [2, 8], "adjust": 10, "advanc": 2, "advantag": 18, "advis": 3, "aesthet": [5, 7], "affect": 2, "after": [15, 19], "ag": 2, "again": 9, "aggreg": [11, 17], "aggress": 2, "align": [2, 8, 10], "all": [2, 3, 6, 7, 8, 10, 11, 16, 17, 19], "allow": [2, 18], "along": 19, "alreadi": [3, 18], "also": [2, 9, 15, 16, 17, 19], "alwai": 17, "an": [2, 3, 5, 7, 8, 9, 11, 16, 18, 19], "analysi": [8, 16], "ancient_greek": 7, "andrej": 1, "angl": [8, 10], "ani": [2, 7, 8, 9, 10, 11, 18, 19], "annot": 7, "anot": 17, "anoth": [9, 13, 17], "answer": 2, "anyascii": 11, "anyon": 5, "anyth": 16, "api": [3, 5], "apolog": 2, "apologi": 2, "app": 3, "appear": 2, "appli": [2, 7, 10], "applic": [5, 9], "appoint": 2, "appreci": 15, "appropri": [2, 3, 19], "ar": [2, 3, 4, 6, 7, 8, 10, 11, 12, 16, 17, 19], "arab": 7, "arabic_diacrit": 7, "arabic_lett": 7, "arabic_punctu": 7, "arbitrarili": [5, 9], "arch": [9, 15], "architectur": [5, 9, 15, 16], "area": 19, "argument": [7, 8, 9, 11, 13, 19], "around": 2, "arrai": [8, 10, 11], "art": [5, 16], "artefact": [11, 16, 19], "artefact_typ": 8, "articl": 1, "artifici": [5, 7], "arxiv": [7, 9], "asarrai": 11, "ascii_lett": 7, "aspect": [5, 9, 10, 19], "assess": 11, "assign": 11, "associ": 8, "assum": 9, "assume_straight_pag": [9, 13, 19], "astyp": [9, 11, 19], "attack": 2, "attend": [5, 9], "attent": [2, 9], "autom": 5, "automat": 19, "autoregress": [5, 9], "avail": [2, 5, 6, 10], "averag": [10, 19], "avoid": [2, 4], "aw": [5, 19], "awar": 19, "azur": 19, 
"b": [9, 11, 19], "b_j": 11, "back": 3, "backbon": 9, "backend": 19, "background": 17, "bangla": 7, "bar": 16, "bar_cod": 17, "baranovskij": 1, "base": [5, 9, 16], "baselin": [5, 9, 19], "batch": [7, 9, 10, 16, 17, 19], "batch_siz": [7, 9, 13, 16, 17, 18], "bblanchon": 4, "bbox": 19, "becaus": 14, "been": [3, 11, 17, 19], "befor": [7, 9, 10, 19], "begin": 11, "behavior": [2, 19], "being": [11, 19], "belong": 19, "benchmark": 19, "best": [1, 2], "better": [12, 19], "between": [10, 11, 19], "bgr": 8, "bilinear": 10, "bin_thresh": 19, "binar": [5, 9, 19], "binari": [8, 18, 19], "bit": 18, "block": [11, 19], "block_1_1": 19, "blur": 10, "bmvc": 7, "bn": 15, "bodi": [2, 19], "bool": [7, 8, 9, 10, 11], "boolean": [9, 19], "both": [5, 7, 10, 17, 19], "bottom": [9, 19], "bound": [7, 8, 9, 10, 11, 16, 17, 19], "box": [7, 8, 9, 10, 11, 16, 17, 19], "box_thresh": 19, "bright": 10, "browser": [3, 5], "build": [3, 4, 18], "built": 3, "byte": [8, 19], "c": [4, 8, 11], "c_j": 11, "cach": [3, 7, 14], "cache_sampl": 7, "call": 18, "callabl": [7, 10], "can": [3, 4, 13, 14, 15, 16, 17, 19], "capabl": [3, 12, 19], "case": [7, 11], "cf": 19, "cfg": 19, "challeng": 7, "challenge2_test_task12_imag": 7, "challenge2_test_task1_gt": 7, "challenge2_training_task12_imag": 7, "challenge2_training_task1_gt": 7, "chang": [14, 19], "channel": [2, 3, 8, 10], "channel_prior": 4, "channelshuffl": 10, "charact": [5, 7, 8, 11, 17, 19], "charactergener": [7, 17], "characterist": 2, "charg": 19, "charset": 19, "chart": 8, "check": [3, 15, 19], "checkpoint": 9, "chip": 4, "christian": 1, "ci": 3, "clarifi": 2, "clariti": 2, "class": [2, 7, 8, 10, 11, 19], "class_nam": 13, "classif": [17, 19], "classmethod": 8, "clear": 3, "clone": 4, "close": 3, "co": 15, "code": [5, 8, 16], "codecov": 3, "colab": 12, "collate_fn": 7, "collect": [8, 16], "color": 10, "colorinvers": 10, "column": 8, "com": [2, 4, 8, 9, 15], "combin": 19, "command": [3, 16], "comment": 2, "commit": 2, "common": [2, 10, 11, 18], "commun": 2, "compar": 5, "comparison": [11, 19], "competit": 7, "compil": [12, 19], "complaint": 2, "complementari": 11, "complet": 3, "compon": 19, "compos": [7, 19], "comprehens": 19, "comput": [7, 11, 18, 19], "conf_threshold": 16, "confid": [8, 19], "config": [4, 9], "configur": 9, "confus": 11, "consecut": [10, 19], "consequ": 2, "consid": [2, 3, 7, 8, 11, 19], "consist": 19, "consolid": [5, 7], "constant": 10, "construct": 2, "contact": 2, "contain": [1, 6, 7, 12, 17, 19], "content": [7, 8, 19], "context": 9, "contib": 4, "continu": 2, "contrast": 10, "contrast_factor": 10, "contrib": [4, 16], "contribut": 2, "contributor": 3, "convers": 8, "convert": [8, 10], "convolut": 9, "cool": 1, "coordin": [8, 19], "cord": [5, 7, 17, 19], "core": [11, 19], "corner": 19, "correct": 10, "correspond": [4, 8, 10, 19], "could": [2, 16], "counterpart": 11, "cover": 3, "coverag": 3, "cpu": [5, 13, 18], "creat": [1, 15], "crnn": [5, 9, 15], "crnn_mobilenet_v3_larg": [9, 15, 19], "crnn_mobilenet_v3_smal": [9, 18, 19], "crnn_vgg16_bn": [9, 13, 15, 19], "crop": [8, 9, 10, 13, 17, 19], "crop_orient": [8, 19], "crop_orientation_predictor": [9, 13], "crop_param": 13, "cuda": 18, "currenc": 7, "current": [3, 13, 19], "custom": [15, 16, 18, 19], "custom_crop_orientation_model": 13, "custom_page_orientation_model": 13, "customhook": 19, "cvit": 5, "czczup": 9, "czech": 7, "d": [7, 17], "danish": 7, "data": [5, 7, 8, 10, 11, 13, 15], "dataload": 17, "dataset": [9, 13, 19], "dataset_info": 7, "date": [13, 19], "db": 15, "db_mobilenet_v3_larg": [9, 15, 19], 
"db_resnet34": 19, "db_resnet50": [9, 13, 15, 19], "dbnet": [5, 9], "deal": [12, 19], "decis": 2, "decod": 8, "decode_img_as_tensor": 8, "dedic": 18, "deem": 2, "deep": [9, 19], "def": 19, "default": [4, 8, 13, 14, 19], "defer": 17, "defin": [11, 18], "degre": [8, 10, 19], "degress": 8, "delet": 3, "delimit": 19, "delta": 10, "demo": [3, 5], "demonstr": 2, "depend": [3, 4, 5, 19], "deploi": 3, "deploy": 5, "derogatori": 2, "describ": 9, "descript": 12, "design": 10, "desir": 8, "det_arch": [9, 13, 15, 18], "det_b": 19, "det_model": [13, 15, 18], "det_param": 13, "det_predictor": [13, 19], "detail": [13, 19], "detect": [1, 7, 8, 11, 12, 13, 16], "detect_languag": 9, "detect_orient": [9, 13, 19], "detection_predictor": [9, 19], "detection_task": [7, 17], "detectiondataset": [7, 17], "detectionmetr": 11, "detectionpredictor": [9, 13], "detector": [5, 9, 16], "deterior": 9, "determin": 2, "dev": [3, 14], "develop": 4, "deviat": 10, "devic": 18, "dict": [8, 11, 19], "dictionari": [8, 11], "differ": 2, "differenti": [5, 9], "digit": [5, 7, 17], "dimens": [8, 11, 19], "dimension": 10, "direct": 7, "directli": [15, 19], "directori": [3, 14], "disabl": [2, 14, 19], "disable_crop_orient": 19, "disable_page_orient": 19, "disclaim": 19, "discuss": 3, "disparag": 2, "displai": [8, 11], "display_artefact": 11, "distribut": 10, "div": 19, "divers": 2, "divid": 8, "do": [3, 4, 9], "doc": [3, 8, 16, 18, 19], "docartefact": [7, 17], "docstr": 3, "doctr": [1, 4, 13, 14, 15, 16, 17, 18, 19], "doctr_cache_dir": 14, "doctr_multiprocessing_dis": 14, "document": [1, 7, 9, 11, 12, 13, 16, 17, 18, 19], "documentbuild": 19, "documentfil": [8, 13, 15, 16, 18], "doesn": 18, "don": [13, 19], "done": 10, "download": [7, 17], "downsiz": 9, "draw": 10, "drop": 7, "drop_last": 7, "dtype": [8, 9, 10, 11, 18], "dual": [5, 7], "dummi": 15, "dummy_img": 19, "dummy_input": 18, "dure": 2, "dutch": 7, "dynam": [7, 16], "dynamic_seq_length": 7, "e": [2, 3, 4, 8, 9], "each": [5, 7, 8, 9, 10, 11, 17, 19], "eas": 3, "easi": [5, 11, 15, 18], "easili": [8, 11, 13, 15, 17, 19], "econom": 2, "edit": 2, "educ": 2, "effect": 19, "effici": [3, 5, 7, 9], "either": [11, 19], "element": [7, 8, 9, 19], "els": [3, 16], "email": 2, "empathi": 2, "en": 19, "enabl": [7, 8], "enclos": 8, "encod": [5, 7, 8, 9, 19], "encode_sequ": 7, "encount": 3, "encrypt": 8, "end": [5, 7, 9, 11], "english": [7, 17], "enough": [3, 19], "ensur": 3, "entri": 7, "environ": [2, 14], "eo": 7, "equiv": 19, "estim": 9, "etc": [8, 16], "ethnic": 2, "evalu": [17, 19], "event": 2, "everyon": 2, "everyth": [3, 19], "exact": [11, 19], "exampl": [2, 3, 5, 7, 9, 15, 19], "exchang": 18, "execut": 19, "exist": 15, "expand": 10, "expect": [8, 10, 11], "experi": 2, "explan": [2, 19], "explicit": 2, "exploit": [5, 9], "export": [8, 9, 11, 12, 16, 19], "export_as_straight_box": [9, 19], "export_as_xml": 19, "export_model_to_onnx": 18, "express": [2, 10], "extens": 8, "extern": [2, 17], "extract": [1, 5, 7], "extractor": 9, "f_": 11, "f_a": 11, "factor": 10, "fair": 2, "fairli": 2, "fals": [7, 8, 9, 10, 11, 13, 19], "faq": 2, "fascan": 15, "fast": [5, 7, 9], "fast_bas": [9, 19], "fast_smal": [9, 19], "fast_tini": [9, 19], "faster": [5, 9, 18], "fasterrcnn_mobilenet_v3_large_fpn": 9, "favorit": 19, "featur": [4, 9, 11, 12, 13, 16], "feedback": 2, "feel": [3, 15], "felix92": 15, "few": [18, 19], "figsiz": 11, "figur": [11, 16], "file": [3, 7], "final": 9, "find": [3, 17], "fine": 1, "finnish": 7, "first": [3, 7], "firsthand": 7, "fit": [9, 19], "flag": 19, "flip": 10, "float": [8, 10, 11, 
18], "float32": [8, 9, 10, 18], "fn": 10, "focu": 15, "focus": [2, 7], "folder": 7, "follow": [2, 3, 4, 7, 10, 11, 13, 14, 15, 16, 19], "font": 7, "font_famili": 7, "foral": 11, "forc": 3, "forg": 4, "form": [5, 7, 19], "format": [8, 11, 13, 17, 18, 19], "forpost": [5, 7], "forum": 3, "found": 1, "fp16": 18, "frac": 11, "framework": [4, 15, 17, 19], "free": [2, 3, 15], "french": [7, 13, 15, 19], "friendli": 5, "from": [1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19], "from_hub": [9, 15], "from_imag": [8, 15, 16, 18], "from_pdf": 8, "from_url": 8, "full": [7, 11, 19], "function": [7, 10, 11, 16], "funsd": [5, 7, 17, 19], "further": 17, "futur": 7, "g": [8, 9], "g_": 11, "g_x": 11, "gallagh": 1, "gamma": 10, "gaussian": 10, "gaussianblur": 10, "gaussiannois": 10, "gen": 19, "gender": 2, "gener": [3, 5, 8, 9], "generic_cyrillic_lett": 7, "geometri": [5, 8, 19], "geq": 11, "german": [7, 13, 15], "get": [18, 19], "git": 15, "github": [3, 4, 9, 15], "give": [2, 16], "given": [7, 8, 10, 11, 19], "global": 9, "go": 19, "good": 18, "googl": 3, "googlevis": 5, "gpu": [5, 16, 18], "gracefulli": 2, "graph": [5, 7, 8], "grayscal": 10, "ground": 11, "groung": 11, "group": [5, 19], "gt": 11, "gt_box": 11, "gt_label": 11, "guid": 3, "guidanc": 17, "gvision": 19, "h": [8, 9, 10], "h_": 11, "ha": [3, 7, 11, 17], "handl": [12, 17, 19], "handwrit": 7, "handwritten": 17, "harass": 2, "hardwar": 19, "harm": 2, "hat": 11, "have": [2, 3, 11, 13, 15, 17, 18, 19], "head": [9, 19], "healthi": 2, "hebrew": 7, "height": [8, 10], "hello": [11, 19], "help": 18, "here": [6, 10, 12, 16, 17, 19], "hf": 9, "hf_hub_download": 9, "high": 8, "higher": [4, 7, 19], "hindi": 7, "hindi_digit": 7, "hocr": 19, "hook": 19, "horizont": [8, 10, 19], "hous": 7, "how": [1, 3, 12, 13, 15, 17], "howev": 17, "hsv": 10, "html": [2, 3, 4, 8, 19], "http": [2, 4, 7, 8, 9, 15, 19], "hub": 9, "hue": 10, "huggingfac": 9, "hw": 7, "i": [2, 3, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18], "i7": 19, "ibrahimov": 1, "ic03": [5, 7, 17], "ic13": [5, 7, 17], "icdar": [5, 7], "icdar2019": 7, "id": 19, "ident": 2, "identifi": 5, "iiit": [5, 7], "iiit5k": [7, 17], "iiithw": [5, 7, 17], "imag": [1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 19], "imagenet": 9, "imageri": 2, "images_90k_norm": 7, "img": [7, 10, 17, 18], "img_cont": 8, "img_fold": [7, 17], "img_path": 8, "img_transform": 7, "imgur5k": [5, 7, 17], "imgur5k_annot": 7, "imlist": 7, "impact": 2, "implement": [7, 8, 9, 10, 11, 19], "import": [7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19], "improv": 9, "inappropri": 2, "incid": 2, "includ": [2, 7, 17, 18], "inclus": 2, "increas": 10, "independ": 10, "index": [3, 8], "indic": 11, "individu": 2, "infer": [5, 9, 10, 16, 19], "inform": [1, 2, 3, 5, 7, 17], "input": [3, 8, 9, 10, 18, 19], "input_crop": 9, "input_pag": [9, 11, 19], "input_shap": 18, "input_tensor": 9, "inspir": [2, 10], "instal": [15, 16, 18], "instanc": [2, 19], "instanti": [9, 19], "instead": [7, 8, 9], "insult": 2, "int": [7, 8, 9, 10], "int64": 11, "integ": 11, "integr": [1, 5, 15, 17], "intel": 19, "interact": [2, 8, 11], "interfac": [15, 18], "interoper": 18, "interpol": 10, "interpret": [7, 8], "intersect": 11, "invert": 10, "investig": 2, "invis": 2, "involv": [2, 19], "io": [13, 15, 16, 18], "iou": 11, "iou_thresh": 11, "iou_threshold": 16, "irregular": [5, 9, 17], "isn": 7, "issu": [2, 3, 15], "italian": 7, "iter": [7, 10, 17, 19], "its": [8, 9, 10, 11, 17, 19], "itself": [9, 15], "j": 11, "jame": 1, "job": 3, "join": 3, "jpeg": 10, "jpegqual": 10, "jpg": [7, 8, 15, 18], "json": [7, 17, 19], 
"json_output": 19, "jump": 3, "just": 2, "kei": [5, 7], "kera": [9, 18], "kernel": [5, 9, 10], "kernel_shap": 10, "keywoard": 9, "keyword": [7, 8, 9, 11], "kie": [9, 13], "kie_predictor": [9, 13], "kiepredictor": 9, "kind": 2, "know": [3, 18], "kwarg": [7, 8, 9, 11], "l": 11, "l_j": 11, "label": [7, 11, 16, 17], "label_fil": [7, 17], "label_fold": 7, "label_path": [7, 17], "labels_path": [7, 17], "ladder": 2, "lambda": 10, "lambdatransform": 10, "lang": 19, "languag": [2, 5, 7, 8, 9, 15, 19], "larg": [9, 15], "largest": 11, "last": [4, 7], "latenc": 9, "later": 3, "latest": 19, "latin": 7, "layer": 18, "layout": 19, "lead": 2, "leader": 2, "learn": [2, 5, 9, 18, 19], "least": 4, "left": [11, 19], "legacy_french": 7, "length": [7, 19], "less": [18, 19], "level": [2, 7, 11, 19], "leverag": 12, "lf": 15, "librari": [3, 4, 12, 13], "light": 5, "lightweight": 18, "like": 2, "limits_": 11, "line": [5, 9, 11, 19], "line_1_1": 19, "link": 13, "linknet": [5, 9], "linknet_resnet18": [9, 13, 18, 19], "linknet_resnet34": [9, 18, 19], "linknet_resnet50": [9, 19], "list": [7, 8, 10, 11, 15], "ll": 11, "load": [5, 7, 9, 16, 18], "load_state_dict": 13, "load_weight": 13, "loc_pr": 19, "local": [3, 5, 7, 9, 11, 17, 19], "localis": 7, "localizationconfus": 11, "locat": [3, 8, 19], "login": 9, "login_to_hub": [9, 15], "logo": [8, 16, 17], "love": 15, "lower": [10, 11, 19], "m": [3, 11, 19], "m1": 4, "macbook": 4, "machin": 18, "made": 5, "magc_resnet31": 9, "mai": [2, 3], "mail": 2, "main": 12, "maintain": 5, "mainten": 3, "make": [2, 3, 11, 13, 14, 15, 18, 19], "mani": [17, 19], "manipul": 19, "map": [7, 9], "map_loc": 13, "master": [5, 9, 19], "match": [11, 19], "mathcal": 11, "matplotlib": [8, 11], "max": [7, 10, 11], "max_angl": 10, "max_area": 10, "max_char": [7, 17], "max_delta": 10, "max_gain": 10, "max_gamma": 10, "max_qual": 10, "max_ratio": 10, "maximum": [7, 10], "maxval": [9, 10], "mbox": 11, "mean": [10, 11, 13], "meaniou": 11, "meant": [8, 18], "measur": 19, "media": 2, "median": 9, "meet": 13, "member": 2, "memori": [14, 18], "mention": 19, "merg": 7, "messag": 3, "meta": 19, "metadata": 18, "metal": 4, "method": [8, 10, 19], "metric": [11, 19], "middl": 19, "might": [18, 19], "min": 10, "min_area": 10, "min_char": [7, 17], "min_gain": 10, "min_gamma": 10, "min_qual": 10, "min_ratio": 10, "min_val": 10, "minde": [1, 2, 4, 5, 9], "minim": [3, 5], "minimalist": [5, 9], "minimum": [4, 7, 10, 11, 19], "minval": 10, "miss": 4, "mistak": 2, "mixed_float16": 18, "mixed_precis": 18, "mjsynth": [5, 7, 17], "mnt": 7, "mobilenet": [9, 15], "mobilenet_v3_larg": 9, "mobilenet_v3_large_r": 9, "mobilenet_v3_smal": [9, 13], "mobilenet_v3_small_crop_orient": [9, 13], "mobilenet_v3_small_page_orient": [9, 13], "mobilenet_v3_small_r": 9, "mobilenetv3": 9, "modal": [5, 7], "mode": 4, "model": [7, 11, 14, 16, 17], "model_nam": [9, 15, 18], "model_path": [16, 18], "moder": 2, "modif": 3, "modifi": [9, 14, 19], "modul": [4, 8, 9, 10, 11, 19], "more": [3, 17, 19], "moscardi": 1, "most": 19, "mozilla": 2, "multi": [5, 9], "multilingu": [7, 15], "multipl": [7, 8, 10, 19], "multipli": 10, "multiprocess": 14, "my": 9, "my_awesome_model": 15, "my_hook": 19, "n": [7, 11], "name": [7, 9, 18, 19], "nation": 2, "natur": [2, 5, 7], "ndarrai": [7, 8, 10, 11], "necessari": [4, 13, 14], "need": [3, 4, 7, 11, 13, 14, 15, 16, 19], "neg": 10, "nest": 19, "netraj": 1, "network": [5, 7, 9, 18], "neural": [5, 7, 9, 18], "new": [3, 11], "next": [7, 17], "nois": 10, "noisi": [5, 7], "non": [5, 7, 8, 9, 10, 11], "none": [7, 8, 9, 10, 11, 
19], "normal": [9, 10], "norwegian": 7, "note": [0, 3, 7, 9, 13, 15, 16, 18], "now": 3, "np": [9, 10, 11, 19], "num_output_channel": 10, "num_sampl": [7, 17], "number": [7, 9, 10, 11, 19], "numpi": [8, 9, 11, 19], "o": 4, "obb": 16, "obj_detect": 15, "object": [7, 8, 11, 16, 19], "objectness_scor": [8, 19], "oblig": 2, "obtain": 19, "occupi": 18, "ocr": [1, 5, 7, 9, 11, 15], "ocr_carea": 19, "ocr_db_crnn": 11, "ocr_lin": 19, "ocr_pag": 19, "ocr_par": 19, "ocr_predictor": [9, 13, 15, 18, 19], "ocrdataset": [7, 17], "ocrmetr": 11, "ocrpredictor": [9, 13], "ocrx_word": 19, "offens": 2, "offici": [2, 9], "offlin": 2, "offset": 10, "onc": 19, "one": [3, 7, 9, 10, 13, 15, 19], "oneof": 10, "ones": [7, 11], "onli": [3, 9, 10, 11, 13, 15, 17, 18, 19], "onlin": 2, "onnx": 16, "onnxruntim": [16, 18], "onnxtr": 18, "opac": 10, "opacity_rang": 10, "open": [1, 2, 3, 15, 18], "opinion": 2, "optic": [5, 19], "optim": [5, 19], "option": [7, 9, 13], "order": [3, 7, 8, 10], "org": [2, 7, 9, 19], "organ": 8, "orient": [2, 8, 9, 12, 16, 19], "orientationpredictor": 9, "other": [2, 3], "otherwis": [2, 8, 11], "our": [1, 3, 9, 19], "out": [3, 9, 10, 11, 19], "outpout": 19, "output": [8, 10, 18], "output_s": [8, 10], "outsid": 14, "over": [7, 11, 19], "overal": [2, 9], "overlai": 8, "overview": 16, "overwrit": 13, "overwritten": 15, "own": 5, "p": [10, 19], "packag": [3, 5, 11, 14, 16, 17, 18], "pad": [7, 9, 10, 19], "page": [4, 7, 9, 11, 13, 19], "page1": 8, "page2": 8, "page_1": 19, "page_idx": [8, 19], "page_orientation_predictor": [9, 13], "page_param": 13, "pair": 11, "paper": 9, "par_1_1": 19, "paragraph": 19, "paragraph_break": 19, "parallel": 9, "param": [10, 19], "paramet": [5, 8, 9, 18], "pars": [5, 7], "parseq": [5, 9, 15, 18, 19], "part": [7, 10, 19], "parti": 4, "partial": 19, "particip": 2, "pass": [7, 8, 9, 13, 19], "password": 8, "patch": [9, 11], "path": [7, 8, 16, 17, 18], "path_to_checkpoint": 13, "path_to_custom_model": 18, "path_to_pt": 13, "patil": 1, "pattern": 2, "pdf": [8, 9, 12], "pdfpage": 8, "peopl": 2, "per": [10, 19], "perform": [5, 8, 9, 10, 11, 14, 18, 19], "period": 2, "permiss": 2, "permut": [5, 9], "persian_lett": 7, "person": [2, 17], "phase": 19, "photo": 17, "physic": [2, 8], "pick": 10, "pictur": 8, "pip": [3, 4, 16, 18], "pipelin": 19, "pixel": [8, 10, 19], "pleas": 3, "plot": 11, "plt": 11, "plug": 15, "plugin": 4, "png": 8, "point": 18, "polici": 14, "polish": 7, "polit": 2, "polygon": [7, 11, 19], "pool": 9, "portugues": 7, "posit": [2, 11], "possibl": [3, 11, 15, 19], "post": [2, 19], "postprocessor": 19, "potenti": 9, "power": 5, "ppageno": 19, "pre": [3, 9, 18], "precis": [11, 19], "pred": 11, "pred_box": 11, "pred_label": 11, "predefin": 17, "predict": [8, 9, 11, 19], "predictor": [5, 8, 9, 12, 13, 15, 18], "prefer": 17, "preinstal": 4, "preprocessor": [13, 19], "prerequisit": 15, "present": 12, "preserv": [9, 10, 19], "preserve_aspect_ratio": [8, 9, 10, 13, 19], "pretrain": [5, 9, 11, 13, 18, 19], "pretrained_backbon": [9, 13], "print": 19, "prior": 7, "privaci": 2, "privat": 2, "probabl": [1, 10], "problem": 3, "procedur": 10, "process": [3, 5, 8, 9, 13, 19], "processor": 19, "produc": [12, 19], "product": 18, "profession": 2, "project": [3, 17], "promptli": 2, "proper": 3, "properli": 7, "provid": [2, 3, 5, 15, 16, 17, 19], "public": [2, 5], "publicli": 19, "publish": 2, "pull": 15, "punctuat": 7, "pure": 7, "purpos": 3, "push_to_hf_hub": [9, 15], "py": 15, "pypdfium2": [4, 8], "pyplot": [8, 11], "python": [1, 3, 16], "python3": 15, "pytorch": [4, 5, 9, 10, 13, 
15, 18, 19], "q": 3, "qr": [8, 16], "qr_code": 17, "qualiti": 10, "question": 2, "quickli": 5, "quicktour": 12, "r": 19, "race": 2, "ramdisk": 7, "rand": [9, 10, 11, 18, 19], "random": [9, 10, 11, 19], "randomappli": 10, "randombright": 10, "randomcontrast": 10, "randomcrop": 10, "randomgamma": 10, "randomhorizontalflip": 10, "randomhu": 10, "randomjpegqu": 10, "randomli": 10, "randomres": 10, "randomrot": 10, "randomsatur": 10, "randomshadow": 10, "rang": 10, "rassi": 15, "ratio": [9, 10, 19], "raw": [8, 11], "re": 18, "read": [5, 7, 9], "read_html": 8, "read_img_as_numpi": 8, "read_img_as_tensor": 8, "read_pdf": 8, "readi": 18, "real": [1, 5, 9, 10], "realli": 1, "reason": [2, 5, 7], "rebuild": 3, "rebuilt": 3, "recal": [11, 19], "receipt": [5, 7, 19], "reco_arch": [9, 13, 15, 18], "reco_b": 19, "reco_model": [13, 15, 18], "reco_param": 13, "reco_predictor": 13, "recogn": 19, "recognit": [7, 11, 12, 13], "recognition_predictor": [9, 19], "recognition_task": [7, 17], "recognitiondataset": [7, 17], "recognitionpredictor": [9, 13], "rectangular": 9, "reduc": [4, 10], "refer": [3, 4, 13, 15, 16, 17, 19], "regardless": 2, "region": 19, "regroup": 11, "regular": 17, "reject": 2, "rel": [8, 10, 11, 19], "relat": 8, "releas": [0, 4], "relev": 16, "religion": 2, "remov": 2, "render": [8, 19], "repo": 9, "repo_id": [9, 15], "report": 2, "repositori": [7, 9, 15], "repres": [2, 18, 19], "represent": [5, 9], "request": [2, 15], "requir": [4, 10, 18], "research": 5, "residu": 9, "resiz": [10, 19], "resnet": 9, "resnet18": [9, 15], "resnet31": 9, "resnet34": 9, "resnet50": [9, 15], "resolv": 8, "resolve_block": 19, "resolve_lin": 19, "resourc": 17, "respect": 2, "rest": [3, 10, 11], "restrict": 14, "result": [3, 7, 8, 12, 15, 18, 19], "return": 19, "reusabl": 19, "review": 2, "rgb": [8, 10], "rgb_mode": 8, "rgb_output": 8, "right": [2, 9, 11], "roboflow": 1, "robust": [5, 7], "root": 7, "rotat": [7, 8, 9, 10, 11, 12, 13, 17, 19], "run": [3, 4, 9], "same": [3, 8, 11, 17, 18, 19], "sampl": [7, 9, 17, 19], "sample_transform": 7, "sanjin": 1, "sar": [5, 9], "sar_resnet31": [9, 19], "satur": 10, "save": [9, 17], "scale": [8, 9, 10, 11], "scale_rang": 10, "scan": [5, 7], "scene": [5, 7, 9], "score": [8, 11], "script": [3, 17], "seamless": 5, "seamlessli": [5, 19], "search": [1, 9], "searchabl": 12, "sec": 19, "second": 19, "section": [1, 13, 15, 16, 18, 19], "secur": [2, 14], "see": [2, 3], "seen": 19, "segment": [5, 9, 19], "self": 19, "semant": [5, 9], "send": 19, "sens": 11, "sensit": 17, "separ": 19, "sequenc": [5, 7, 8, 9, 11, 19], "sequenti": [10, 19], "seri": 2, "seriou": 2, "set": [2, 4, 7, 9, 11, 14, 16, 19], "set_global_polici": 18, "sever": [8, 10, 19], "sex": 2, "sexual": 2, "shade": 10, "shape": [5, 8, 9, 10, 11, 19], "share": [14, 17], "shift": 10, "shm": 14, "should": [3, 7, 8, 10, 11], "show": [5, 8, 9, 11, 13, 15, 16], "showcas": [3, 12], "shuffl": [7, 10], "side": 11, "signatur": 8, "signific": 17, "simpl": [5, 9, 18], "simpler": 9, "sinc": [7, 17], "singl": [2, 3, 5, 7], "single_img_doc": 18, "size": [2, 7, 8, 10, 16, 19], "skew": 19, "slack": 3, "slightli": 9, "small": [3, 9, 19], "smallest": 8, "snapshot_download": 9, "snippet": 19, "so": [3, 4, 7, 9, 15, 17], "social": 2, "socio": 2, "some": [1, 4, 12, 15, 17], "someth": 3, "somewher": 3, "sort": 2, "sourc": [1, 7, 8, 9, 10, 11, 15], "space": [2, 19], "span": 19, "spanish": 7, "spatial": [5, 7, 8], "specif": [3, 4, 11, 13, 17, 19], "specifi": [2, 7, 8], "speed": [5, 9, 19], "sphinx": 3, "sroie": [5, 7, 17], "stabl": 4, "stackoverflow": 
3, "stage": 5, "standalon": 12, "standard": 10, "start": 7, "state": [1, 5, 11, 16], "static": 11, "statist": 1, "statu": 2, "std": [10, 13], "step": 14, "still": 19, "str": [7, 8, 9, 10, 11], "straight": [7, 9, 17, 19], "straighten": 19, "straighten_pag": [9, 13, 19], "straigten_pag": 13, "stream": 8, "street": [5, 7], "strict": 4, "strictli": 11, "string": [7, 8, 11, 19], "strive": 4, "strong": [5, 9], "structur": [18, 19], "subset": [7, 19], "suggest": [3, 15], "sum": 11, "summari": 11, "support": [4, 13, 16, 18, 19], "sustain": 2, "svhn": [5, 7, 17], "svt": [7, 17], "swedish": 7, "symmetr": [9, 10, 19], "symmetric_pad": [9, 10, 19], "synthet": 5, "synthtext": [5, 7, 17], "system": 19, "t": [3, 7, 13, 18, 19], "tabl": [15, 16, 17], "take": [2, 7, 19], "target": [7, 8, 10, 11, 17], "target_s": 7, "task": [5, 7, 9, 15, 17, 19], "task2": 7, "team": 4, "techminde": 4, "templat": [3, 5], "tensor": [7, 8, 10, 19], "tensorflow": [4, 5, 8, 9, 10, 13, 15, 18, 19], "tensorspec": 18, "term": 2, "test": [7, 17], "test_set": 7, "text": [1, 7, 8, 9, 11, 17], "text_output": 19, "textmatch": 11, "textnet": 9, "textnet_bas": 9, "textnet_smal": 9, "textnet_tini": 9, "textract": [5, 19], "textstylebrush": [5, 7], "textual": [5, 7, 8, 9, 19], "tf": [4, 8, 9, 10, 15, 18], "than": [3, 11, 15], "thank": 3, "thei": [2, 11], "them": [7, 19], "thi": [1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 15, 17, 18, 19], "thing": [18, 19], "third": 4, "those": [2, 8, 19], "threaten": 2, "threshold": 19, "through": [2, 10, 16, 17], "tilman": 15, "time": [1, 2, 5, 9, 11, 17], "tini": 9, "titl": [8, 19], "tm": 19, "tmp": 14, "togeth": [3, 8], "tograi": 10, "tool": [1, 17], "top": [11, 18, 19], "topic": 3, "torch": [4, 10, 13, 15, 18], "torchvis": 10, "total": 13, "toward": [2, 4], "train": [3, 7, 9, 10, 15, 16, 17, 18, 19], "train_it": [7, 17], "train_load": [7, 17], "train_pytorch": 15, "train_set": [7, 17], "train_tensorflow": 15, "trainabl": [5, 9], "tranform": 10, "transcrib": 19, "transfer": [5, 7], "transfo": 10, "transform": [5, 7, 9], "translat": 2, "troll": 2, "true": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19], "truth": 11, "tune": [1, 18], "tupl": [7, 8, 10, 11], "two": [8, 14], "txt": 7, "type": [8, 11, 15, 18, 19], "typic": 19, "u": [2, 3], "ucsd": 7, "udac": 3, "uint8": [8, 9, 11, 19], "ukrainian": 7, "unaccept": 2, "underli": [17, 19], "underneath": 8, "understand": [5, 7, 19], "uniform": [9, 10], "uniformli": 10, "uninterrupt": [8, 19], "union": 11, "unit": 1, "unittest": 3, "unlock": 8, "unoffici": 9, "unprofession": 2, "unsolicit": 2, "unsupervis": 5, "unwelcom": 2, "up": [9, 19], "updat": 11, "upgrad": 3, "upper": [7, 10], "uppercas": 17, "url": 8, "us": [2, 3, 4, 7, 9, 11, 12, 13, 14, 15, 16, 19], "usabl": 19, "usag": [14, 18], "use_polygon": [7, 11, 17], "useabl": 19, "user": [5, 8, 12], "utf": 19, "util": 18, "v1": 15, "v3": [9, 15, 19], "valid": 17, "valu": [3, 8, 10, 19], "valuabl": 5, "variabl": 14, "varieti": 7, "veri": 9, "verma": 1, "version": [2, 3, 4, 18, 19], "vgg": 9, "vgg16": 15, "vgg16_bn_r": 9, "via": 2, "video": 1, "vietnames": 7, "view": [5, 7], "viewpoint": 2, "violat": 2, "visibl": 2, "vision": [5, 7, 9], "visiondataset": 7, "visiontransform": 9, "visual": [4, 5, 16], "visualize_pag": 11, "vit_": 9, "vit_b": 9, "vitstr": [5, 9, 18], "vitstr_bas": [9, 19], "vitstr_smal": [9, 13, 18, 19], "viz": 4, "vocab": [13, 15, 17, 18, 19], "vocabulari": [7, 13, 15], "w": [8, 9, 10, 11], "w3": 19, "wa": 2, "wai": [2, 5, 17], "want": [3, 18, 19], "warmup": 19, "wasn": 3, "we": [1, 2, 3, 4, 5, 8, 10, 13, 15, 17, 
18, 19], "weasyprint": 8, "web": [3, 8], "websit": 7, "welcom": 2, "well": [1, 2, 18], "were": [2, 8, 19], "what": [1, 2], "when": [2, 3, 9], "whenev": 3, "where": [3, 8, 10, 11], "whether": [3, 7, 8, 10, 11, 17, 19], "which": [2, 9, 14, 16, 17, 19], "whichev": 4, "while": [10, 19], "why": 2, "width": [8, 10], "wiki": 2, "wildreceipt": [5, 7, 17], "window": [9, 11], "wish": 3, "within": 2, "without": [2, 7, 9], "wonder": 3, "word": [5, 7, 9, 11, 19], "word_1_1": 19, "word_1_2": 19, "word_1_3": 19, "wordgener": [7, 17], "words_onli": 11, "work": [1, 13, 14, 19], "workflow": 3, "worklow": 3, "world": [11, 19], "worth": 9, "wrap": 19, "wrapper": [7, 10], "write": 14, "written": [2, 8], "www": [2, 8, 19], "x": [8, 10, 11], "x_ascend": 19, "x_descend": 19, "x_i": 11, "x_size": 19, "x_wconf": 19, "xhtml": 19, "xmax": 8, "xmin": 8, "xml": 19, "xml_bytes_str": 19, "xml_element": 19, "xml_output": 19, "xmln": 19, "y": 11, "y_i": 11, "y_j": 11, "yet": 16, "ymax": 8, "ymin": 8, "yolov8": 16, "you": [3, 4, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "your": [3, 5, 8, 11, 19], "yoursit": 8, "yugesh": 1, "zero": [10, 11], "zoo": 13, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 7, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 7, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 7, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 7, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 7, "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 7, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 7, "\u00e4\u00f6\u00e4\u00f6": 7, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 7, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 7, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 7, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 7, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 7, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": 7, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": 7, "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": 7, "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 7, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 7, "\u067e\u0686\u06a2\u06a4\u06af": 7, "\u0905": 7, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 7, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 7, "\u0950": 7, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 7, "\u09bd": 7, "\u09ce": 7, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 7}, "titles": ["Changelog", "Community resources", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 3, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 2], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 2], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 2], "31": 0, "4": [0, 2], "5": 0, "6": 0, "7": 0, "8": 0, "9": 0, "advanc": 19, "approach": 19, "architectur": 19, "arg": [7, 8, 9, 10, 11], "artefact": 8, "artefactdetect": 16, "attribut": 2, "avail": [16, 17, 19], "aw": 14, "ban": 2, "block": 8, "bug": 3, "changelog": 0, "choos": [17, 19], "classif": [9, 13, 15], "code": [2, 3], "codebas": 3, "commit": 3, "commun": [1, 15], "compos": 10, "conda": 4, "conduct": 2, "connect": 3, "continu": 3, "contrib": 6, "contribut": [3, 6, 16], "contributor": 2, "convent": 15, "correct": 2, 
"coven": 2, "custom": [7, 13], "data": 17, "dataload": 7, "dataset": [5, 7, 17], "detect": [5, 9, 15, 17, 19], "develop": 3, "do": 19, "doctr": [3, 5, 6, 7, 8, 9, 10, 11, 12], "document": [3, 5, 8], "end": 19, "enforc": 2, "evalu": 11, "export": 18, "factori": 9, "featur": [3, 5], "feedback": 3, "file": 8, "from": 15, "gener": [7, 17], "git": 4, "guidelin": 2, "half": 18, "hub": 15, "huggingfac": 15, "i": 19, "infer": 18, "instal": [3, 4], "integr": [3, 16], "io": 8, "lambda": 14, "let": 3, "line": 8, "linux": 4, "load": [13, 15, 17], "loader": 7, "main": 5, "mode": 3, "model": [5, 9, 13, 15, 18, 19], "modifi": 3, "modul": [6, 16], "name": 15, "notebook": 12, "object": 17, "ocr": [17, 19], "onli": 4, "onnx": 18, "optim": 18, "option": 19, "orient": 13, "our": 2, "output": 19, "own": [13, 17], "packag": 4, "page": 8, "perman": 2, "pipelin": 16, "pledg": 2, "precis": 18, "predictor": 19, "prepar": 18, "prerequisit": 4, "pretrain": 15, "push": 15, "python": 4, "qualiti": 3, "question": 3, "read": 8, "readi": 17, "recognit": [5, 9, 15, 17, 19], "report": 3, "request": 3, "resourc": 1, "respons": 2, "return": [7, 8, 9, 11], "right": 19, "scope": 2, "share": 15, "should": 19, "stage": 19, "standard": 2, "structur": [3, 8], "style": 3, "support": [5, 6, 7, 10], "synthet": [7, 17], "task": 11, "temporari": 2, "test": 3, "text": [5, 19], "train": 13, "transform": 10, "two": 19, "unit": 3, "us": [17, 18], "util": 11, "v0": 0, "verif": 3, "via": 4, "visual": 11, "vocab": 7, "warn": 2, "what": 19, "word": 8, "your": [13, 15, 16, 17, 18], "zoo": [5, 9]}}) \ No newline at end of file diff --git a/v0.6.0/transforms.html b/v0.6.0/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.6.0/transforms.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.
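For instance (a minimal sketch assuming the TensorFlow backend; the individual classes are documented below), a preprocessing chain is built once and then called like a function:
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, Normalize
>>> # resize then normalize per channel, expressed as a single composable module
>>> transfo = Compose([Resize((32, 32)), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))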

-
-

Supported transformations

-

Here are all the transformations that are available through docTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically (see the sketch below)

  • -
-
-
-
- -
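For instance, an aspect-preserving resize with symmetric zero-padding can be sketched as follows (illustrative shapes, assuming the TensorFlow backend):
>>> import tensorflow as tf
>>> from doctr.transforms import Resize
>>> transfo = Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
>>> # a 32x64 input is scaled down to 16x32, then zero-padded evenly on top and bottom to reach 32x32
>>> out = transfo(tf.random.uniform(shape=[32, 64, 3], minval=0, maxval=1))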
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor per channel, subtracting the given mean and dividing by the given standard deviation

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a user-defined function to the input tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): -convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta -to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – the offset to add to each pixel, randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting -each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and -increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – the offset to add to each pixel, randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.
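A rough sketch of such a chain, combining the composition helpers documented below with a couple of the random transformations above (assuming the TensorFlow backend):
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, OneOf, RandomApply, RandomBrightness, RandomContrast, RandomGamma
>>> # gamma correction applied half of the time, followed by exactly one photometric jitter
>>> transfo = Compose([RandomApply(RandomGamma(), p=0.5), OneOf([RandomBrightness(), RandomContrast()])])
>>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))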

-
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomGamma, RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, one only will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Apply the input transformation with a probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.6.0/using_doctr/custom_models_training.html b/v0.6.0/using_doctr/custom_models_training.html index df39d8d568..b714c1f971 100644 --- a/v0.6.0/using_doctr/custom_models_training.html +++ b/v0.6.0/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -619,7 +619,7 @@

Loading your custom trained orientation classification model - + diff --git a/v0.6.0/using_doctr/running_on_aws.html b/v0.6.0/using_doctr/running_on_aws.html index 16ceaca7a1..808ea541cd 100644 --- a/v0.6.0/using_doctr/running_on_aws.html +++ b/v0.6.0/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -362,7 +362,7 @@

AWS Lambda - + diff --git a/v0.6.0/using_doctr/sharing_models.html b/v0.6.0/using_doctr/sharing_models.html index d76b4017f4..c9e978400a 100644 --- a/v0.6.0/using_doctr/sharing_models.html +++ b/v0.6.0/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -544,7 +544,7 @@

Recognition - + diff --git a/v0.6.0/using_doctr/using_contrib_modules.html b/v0.6.0/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.6.0/using_doctr/using_contrib_modules.html +++ b/v0.6.0/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.6.0/using_doctr/using_datasets.html b/v0.6.0/using_doctr/using_datasets.html index 460476dbbf..8a7d4f0a64 100644 --- a/v0.6.0/using_doctr/using_datasets.html +++ b/v0.6.0/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -642,7 +642,7 @@

Data Loading - + diff --git a/v0.6.0/using_doctr/using_model_export.html b/v0.6.0/using_doctr/using_model_export.html index 6124c00ebe..6790dd0642 100644 --- a/v0.6.0/using_doctr/using_model_export.html +++ b/v0.6.0/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -467,7 +467,7 @@

Using your ONNX exported model - + diff --git a/v0.6.0/using_doctr/using_models.html b/v0.6.0/using_doctr/using_models.html index 61f1f5ab7a..9ead8498e1 100644 --- a/v0.6.0/using_doctr/using_models.html +++ b/v0.6.0/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1253,7 +1253,7 @@

Advanced options - + diff --git a/v0.6.0/utils.html b/v0.6.0/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.6.0/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.utils

-

This module gathers non-core features that complement the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest window side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model's performance.

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements a text match metric (word-level accuracy) for the recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, -TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, -f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, -\(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
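For the example above (an illustrative reading of the metric): the raw, case-sensitive score is 1/2 = 0.5, since only 'world' matches exactly, while the lower-case score is 2/2 = 1.0.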
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode -counterpart and its lower-case unidecode counterpart

-
-
-
- -
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ -Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, -g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float | None, float | None, float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores

-
-
-
- -
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, -\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ -Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, -h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{L}\) is the set of possible character sequences, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall & precision for each string comparison flexibility and the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/datasets/cord.html b/v0.7.0/_modules/doctr/datasets/cord.html index 46e00abe77..0e7141c9cf 100644 --- a/v0.7.0/_modules/doctr/datasets/cord.html +++ b/v0.7.0/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -445,7 +445,7 @@

Source code for doctr.datasets.cord

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/core.html b/v0.7.0/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.7.0/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
- -
-
-
-
- - -
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.7.0/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.7.0/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset, _VisionDataset): - pass
- -
-
-
-
- - -
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/datasets/detection.html b/v0.7.0/_modules/doctr/datasets/detection.html index e7009409d8..520a53e311 100644 --- a/v0.7.0/_modules/doctr/datasets/detection.html +++ b/v0.7.0/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -421,7 +421,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/doc_artefacts.html b/v0.7.0/_modules/doctr/datasets/doc_artefacts.html index 906edfbd02..007c71c365 100644 --- a/v0.7.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.7.0/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -407,7 +407,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.7.0/_modules/doctr/datasets/funsd.html b/v0.7.0/_modules/doctr/datasets/funsd.html index 578b6ac937..1ac37edbc6 100644 --- a/v0.7.0/_modules/doctr/datasets/funsd.html +++ b/v0.7.0/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -439,7 +439,7 @@

Source code for doctr.datasets.funsd

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.7.0/_modules/doctr/datasets/generator/tensorflow.html index 97764a5b1b..4ad437bd89 100644 --- a/v0.7.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.7.0/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -387,7 +387,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/datasets/ic03.html b/v0.7.0/_modules/doctr/datasets/ic03.html index e394592496..bbaae590fa 100644 --- a/v0.7.0/_modules/doctr/datasets/ic03.html +++ b/v0.7.0/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -451,7 +451,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/ic13.html b/v0.7.0/_modules/doctr/datasets/ic13.html index 8cc7c5d9d1..d2c344d276 100644 --- a/v0.7.0/_modules/doctr/datasets/ic13.html +++ b/v0.7.0/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -424,7 +424,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/iiit5k.html b/v0.7.0/_modules/doctr/datasets/iiit5k.html index ad0c8a6a63..8506e634b4 100644 --- a/v0.7.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.7.0/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -427,7 +427,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/iiithws.html b/v0.7.0/_modules/doctr/datasets/iiithws.html index cb80799b4d..56de57d502 100644 --- a/v0.7.0/_modules/doctr/datasets/iiithws.html +++ b/v0.7.0/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -400,7 +400,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/imgur5k.html b/v0.7.0/_modules/doctr/datasets/imgur5k.html index b6bb0a7a6c..c29f733ae2 100644 --- a/v0.7.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.7.0/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -472,7 +472,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/loader.html b/v0.7.0/_modules/doctr/datasets/loader.html index 58b250220e..40fb3525de 100644 --- a/v0.7.0/_modules/doctr/datasets/loader.html +++ b/v0.7.0/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -426,7 +426,7 @@

Source code for doctr.datasets.loader

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/mjsynth.html b/v0.7.0/_modules/doctr/datasets/mjsynth.html index c46f57431c..da0b98a607 100644 --- a/v0.7.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.7.0/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -431,7 +431,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/ocr.html b/v0.7.0/_modules/doctr/datasets/ocr.html index 1c8c1ed153..8582c04ea4 100644 --- a/v0.7.0/_modules/doctr/datasets/ocr.html +++ b/v0.7.0/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -396,7 +396,7 @@

Source code for doctr.datasets.ocr

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/recognition.html b/v0.7.0/_modules/doctr/datasets/recognition.html index 99e48e8086..941dbea77c 100644 --- a/v0.7.0/_modules/doctr/datasets/recognition.html +++ b/v0.7.0/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -381,7 +381,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/sroie.html b/v0.7.0/_modules/doctr/datasets/sroie.html index d9eb3c6f9b..84f2df9766 100644 --- a/v0.7.0/_modules/doctr/datasets/sroie.html +++ b/v0.7.0/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -428,7 +428,7 @@

Source code for doctr.datasets.sroie

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/svhn.html b/v0.7.0/_modules/doctr/datasets/svhn.html index 59fefb9738..ddef2456c5 100644 --- a/v0.7.0/_modules/doctr/datasets/svhn.html +++ b/v0.7.0/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -456,7 +456,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/svt.html b/v0.7.0/_modules/doctr/datasets/svt.html index 7cbeddf891..a3fde4dc2f 100644 --- a/v0.7.0/_modules/doctr/datasets/svt.html +++ b/v0.7.0/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -442,7 +442,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/synthtext.html b/v0.7.0/_modules/doctr/datasets/synthtext.html index f92c9fcf3e..fa5189063b 100644 --- a/v0.7.0/_modules/doctr/datasets/synthtext.html +++ b/v0.7.0/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -453,7 +453,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/utils.html b/v0.7.0/_modules/doctr/datasets/utils.html index ab7f6e75e1..6057b2e54e 100644 --- a/v0.7.0/_modules/doctr/datasets/utils.html +++ b/v0.7.0/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -526,7 +526,7 @@

Source code for doctr.datasets.utils

     
   
- + diff --git a/v0.7.0/_modules/doctr/datasets/wildreceipt.html b/v0.7.0/_modules/doctr/datasets/wildreceipt.html index c543ee7cac..12c6aebd14 100644 --- a/v0.7.0/_modules/doctr/datasets/wildreceipt.html +++ b/v0.7.0/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.7.0/_modules/doctr/documents/elements.html b/v0.7.0/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.7.0/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
-
- -
-
-
-
- - -
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/documents/reader.html b/v0.7.0/_modules/doctr/documents/reader.html deleted file mode 100644 index cdcd814b6c..0000000000 --- a/v0.7.0/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
-
- -
-
-
-
- - -
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/io/elements.html b/v0.7.0/_modules/doctr/io/elements.html index f9743f9c90..2621746349 100644 --- a/v0.7.0/_modules/doctr/io/elements.html +++ b/v0.7.0/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -943,7 +943,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.7.0/_modules/doctr/io/html.html b/v0.7.0/_modules/doctr/io/html.html index 43f83891e1..16fdac956a 100644 --- a/v0.7.0/_modules/doctr/io/html.html +++ b/v0.7.0/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -352,7 +352,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.7.0/_modules/doctr/io/image/base.html b/v0.7.0/_modules/doctr/io/image/base.html index 4cb926010b..9d386f7e77 100644 --- a/v0.7.0/_modules/doctr/io/image/base.html +++ b/v0.7.0/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -381,7 +381,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.7.0/_modules/doctr/io/image/tensorflow.html b/v0.7.0/_modules/doctr/io/image/tensorflow.html index 81a9c6db3c..7be0d18e8e 100644 --- a/v0.7.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.7.0/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -434,7 +434,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.7.0/_modules/doctr/io/pdf.html b/v0.7.0/_modules/doctr/io/pdf.html index e5d9a0b5d0..4f5af28982 100644 --- a/v0.7.0/_modules/doctr/io/pdf.html +++ b/v0.7.0/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -367,7 +367,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.7.0/_modules/doctr/io/reader.html b/v0.7.0/_modules/doctr/io/reader.html index 299779cf2c..66cb1f947a 100644 --- a/v0.7.0/_modules/doctr/io/reader.html +++ b/v0.7.0/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -406,7 +406,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.7.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.7.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html index fb26f46a98..018c4f3df6 100644 --- a/v0.7.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -515,7 +515,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.7.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6d63a1299e..2eb4b47bbd 100644 --- a/v0.7.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -738,7 +738,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.7.0/_modules/doctr/models/classification/resnet/tensorflow.html index 6383146867..d0e05415df 100644 --- a/v0.7.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -723,7 +723,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.7.0/_modules/doctr/models/classification/textnet/tensorflow.html index ad254ebbfb..c5567d7d67 100644 --- a/v0.7.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -611,7 +611,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.7.0/_modules/doctr/models/classification/vgg/tensorflow.html index dbd2845713..0f24518c3a 100644 --- a/v0.7.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -436,7 +436,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.7.0/_modules/doctr/models/classification/vit/tensorflow.html index 05a2a2ca0c..5b7b117dc6 100644 --- a/v0.7.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -515,7 +515,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/classification/zoo.html b/v0.7.0/_modules/doctr/models/classification/zoo.html index 7c6beed9b2..8c361d3bb9 100644 --- a/v0.7.0/_modules/doctr/models/classification/zoo.html +++ b/v0.7.0/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -395,7 +395,7 @@

Source code for doctr.models.classification.zoo

<
- + diff --git a/v0.7.0/_modules/doctr/models/detection/differentiable_binarization.html b/v0.7.0/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode 100644 index 38e9b36ec2..0000000000 --- a/v0.7.0/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.differentiable_binarization - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to expand (unclip) the polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: coordinates of the polygon to expand
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly cast to an ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
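For reference, the expansion distance computed above follows the unclip rule described in the DB paper,

$$D = \frac{\mathrm{Area}(\mathrm{poly}) \times \mathrm{unclip\_ratio}}{\mathrm{Perimeter}(\mathrm{poly})},$$

which is exactly the `poly.area * self.unclip_ratio / poly.length` line before the pyclipper offset.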
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too-small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channels to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):  # top-down: add the upsampled coarser level to each finer level
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature map is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
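In geometric terms (a reading of the code above), for a grid point $P$ and segment endpoints $A$, $B$, the value assembled is

$$d(P) = \frac{\lVert PA\rVert \, \lVert PB\rVert \, \sin\widehat{APB}}{\lVert AB\rVert},$$

i.e. the distance from $P$ to the line through $A$ and $B$, replaced by $\min(\lVert PA\rVert, \lVert PB\rVert)$ wherever $\cos\widehat{APB} > 0$ (the `cosin < 0` branch, since `cosin` stores $-\cos\widehat{APB}$).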
-
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon treshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon: array of coordinates defining the boundary of the polygon
-            canvas: threshold map to fill with polygons
-            mask: mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
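Putting the three terms together with the coefficients defined above (`bce_scale = 5.`, `l1_scale = 10.`), the value returned is

$$\mathcal{L} = 5\,\mathcal{L}_{\mathrm{BCE}}^{\mathrm{balanced}} + \mathcal{L}_{\mathrm{dice}} + 10\,\mathcal{L}_{\ell_1}.$$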
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.7.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index f7e27fdc68..90b457edb2 100644 --- a/v0.7.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -712,7 +712,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo

- + diff --git a/v0.7.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.7.0/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.7.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/detection/linknet.html b/v0.7.0/_modules/doctr/models/detection/linknet.html deleted file mode 100644 index 129cfdce8b..0000000000 --- a/v0.7.0/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.linknet - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from the LinkNet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.7.0/_modules/doctr/models/detection/linknet/tensorflow.html index ec15d41068..c36f166f89 100644 --- a/v0.7.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -721,7 +721,7 @@

Source code for doctr.models.detection.linknet.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/detection/zoo.html b/v0.7.0/_modules/doctr/models/detection/zoo.html index e90b7350c5..1e47b8e170 100644 --- a/v0.7.0/_modules/doctr/models/detection/zoo.html +++ b/v0.7.0/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -421,7 +421,7 @@

Source code for doctr.models.detection.zoo

     
   
- + diff --git a/v0.7.0/_modules/doctr/models/export.html b/v0.7.0/_modules/doctr/models/export.html deleted file mode 100644 index f25a81aa21..0000000000 --- a/v0.7.0/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@ - - - - - - - - - - - - doctr.models.export - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
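The three converters above all return raw TFLite flatbuffer bytes. As a minimal usage sketch (not part of the module itself; it only assumes the standard `tf.lite.Interpreter` API and mirrors the docstring examples), the serialized model can be loaded and exercised like this:

import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from doctr.models import convert_to_tflite, conv_sequence

# Build a toy model and serialize it, as in the docstring examples above
model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
serialized_model = convert_to_tflite(model)  # raw flatbuffer bytes

# Load the bytes with the TFLite interpreter and run a dummy forward pass
interpreter = tf.lite.Interpreter(model_content=serialized_model)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
dummy = np.zeros(input_details[0]["shape"], dtype=input_details[0]["dtype"])
interpreter.set_tensor(input_details[0]["index"], dummy)
interpreter.invoke()
output = interpreter.get_tensor(interpreter.get_output_details()[0]["index"])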
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/models/factory/hub.html b/v0.7.0/_modules/doctr/models/factory/hub.html index 1f713fbcd3..cc1d94b666 100644 --- a/v0.7.0/_modules/doctr/models/factory/hub.html +++ b/v0.7.0/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -562,7 +562,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.7.0/_modules/doctr/models/recognition/crnn.html b/v0.7.0/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.7.0/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.crnn - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs CTC decoding of the raw model output and maps the decoded predictions
-        to characters using the label_to_idx mapping dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            model_output: predicted logits of the model
-            target: list of ground-truth labels (strings) for the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process predictions
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.7.0/_modules/doctr/models/recognition/crnn/tensorflow.html index 54ecbff9e6..ee00292e89 100644 --- a/v0.7.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -642,7 +642,7 @@

Source code for doctr.models.recognition.crnn.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.7.0/_modules/doctr/models/recognition/master/tensorflow.html index 90a8655a25..383001a7c3 100644 --- a/v0.7.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -632,7 +632,7 @@

Source code for doctr.models.recognition.master.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.7.0/_modules/doctr/models/recognition/parseq/tensorflow.html index ff75ce9e87..6a62b57ec3 100644 --- a/v0.7.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -826,7 +826,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/recognition/sar.html b/v0.7.0/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.7.0/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.sar - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H, W, C) -> (N, C)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
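For intuition, the glimpse computed above is a per-channel weighted sum of the feature map, with weights given by a softmax over the H x W spatial positions. A minimal standalone sketch with made-up toy shapes (N, H, W, C are arbitrary here, and `scores` stands in for the projected attention logits):

>>> import tensorflow as tf
>>> N, H, W, C = 2, 4, 8, 16
>>> features = tf.random.uniform((N, H, W, C))
>>> scores = tf.random.uniform((N, H, W, 1))                          # stand-in for the attention logits
>>> attention = tf.nn.softmax(tf.reshape(scores, (N, H * W)), axis=-1)
>>> attention_map = tf.reshape(attention, (N, H, W, 1))
>>> glimpse = tf.reduce_sum(features * attention_map, axis=[1, 2])    # shape (N, C)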
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, 1)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + 1) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
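The loop above relies on teacher forcing: during training the next input symbol is taken from the ground truth, while at inference it is the argmax of the current logits. A toy sketch of that update rule (shapes and values are made up):

>>> import tensorflow as tf
>>> logits = tf.random.uniform((2, 6))      # (N, vocab_size + 1)
>>> gt = tf.constant([[3, 1], [2, 4]])      # encoded ground-truth characters, (N, max_length)
>>> training, t = True, 0
>>> symbol = gt[:, t] if training else tf.argmax(logits, axis=-1)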
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
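To make the masking in compute_loss concrete, here is a standalone toy example (values are made up): timesteps after each word's <eos> step are zeroed out before the per-timestep cross-entropy is averaged over the true sequence length.

>>> import tensorflow as tf
>>> cce = tf.ones((2, 5))                    # pretend per-timestep cross-entropy, (N, max_length + 1)
>>> seq_len = tf.constant([2, 4]) + 1        # word lengths + 1 for the <eos> step
>>> mask = tf.sequence_mask(seq_len, 5)
>>> masked = tf.where(mask, cce, tf.zeros_like(cce))
>>> loss = tf.reduce_sum(masked, axis=1) / tf.cast(seq_len, tf.float32)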
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process predictions
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
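For intuition only (this is not library code): the post-processor maps each row of predicted indices to characters, joins them into a string, and keeps everything before the first <eos> marker. A minimal sketch of that last step with hard-coded strings:

>>> import tensorflow as tf
>>> decoded = tf.constant(["cat<eos>xyz", "dog<eos>"])
>>> parts = tf.strings.split(decoded, "<eos>")
>>> words = tf.sparse.to_dense(parts.to_sparse(), default_value="")[:, 0]
>>> [w.decode() for w in words.numpy().tolist()]
['cat', 'dog']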
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
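Because _sar patches the configuration with any recognized keyword arguments before building the model, hyper-parameters can be overridden through the public constructors below; an illustrative call (values are arbitrary, and note that embedding_units and attention_units then follow the overridden rnn_units unless passed explicitly):

>>> from doctr.models import sar_resnet31
>>> model = sar_resnet31(pretrained=False, rnn_units=256, max_length=40)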
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.7.0/_modules/doctr/models/recognition/sar/tensorflow.html index 242708ee64..b657759fc4 100644 --- a/v0.7.0/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -744,7 +744,7 @@

Source code for doctr.models.recognition.sar.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.7.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 775a5943cd..8ed22fb349 100644 --- a/v0.7.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.7.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -599,7 +599,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/models/recognition/zoo.html b/v0.7.0/_modules/doctr/models/recognition/zoo.html index 902f7b7903..0d405abf4d 100644 --- a/v0.7.0/_modules/doctr/models/recognition/zoo.html +++ b/v0.7.0/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -399,7 +399,7 @@

Source code for doctr.models.recognition.zoo

   
- + diff --git a/v0.7.0/_modules/doctr/models/zoo.html b/v0.7.0/_modules/doctr/models/zoo.html index 635257609e..524106f5dd 100644 --- a/v0.7.0/_modules/doctr/models/zoo.html +++ b/v0.7.0/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -552,7 +552,7 @@

Source code for doctr.models.zoo

     
   
- + diff --git a/v0.7.0/_modules/doctr/transforms/modules.html b/v0.7.0/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.7.0/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - - - - - - - - - - - - doctr.transforms.modules - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
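Since these modules are composable, a full augmentation pipeline can presumably be assembled from them; a sketch (assuming, as the docstring examples above do for Compose and Resize, that all of these classes are re-exported under doctr.transforms):

>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, Normalize, RandomApply, OneOf, RandomBrightness, RandomContrast
>>> transfos = Compose([
...     Resize((32, 128)),
...     RandomApply(OneOf([RandomBrightness(0.3), RandomContrast(0.3)]), p=0.5),
...     Normalize(mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)),
... ])
>>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))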
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/transforms/modules/base.html b/v0.7.0/_modules/doctr/transforms/modules/base.html index c175ed68da..64e32f3dde 100644 --- a/v0.7.0/_modules/doctr/transforms/modules/base.html +++ b/v0.7.0/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -608,7 +608,7 @@

Source code for doctr.transforms.modules.base

- + diff --git a/v0.7.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.7.0/_modules/doctr/transforms/modules/tensorflow.html index df60d47514..0f165b47a4 100644 --- a/v0.7.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.7.0/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -880,7 +880,7 @@

Source code for doctr.transforms.modules.tensorflow

- + diff --git a/v0.7.0/_modules/doctr/utils/metrics.html b/v0.7.0/_modules/doctr/utils/metrics.html index e76fef948f..655a7c61a7 100644 --- a/v0.7.0/_modules/doctr/utils/metrics.html +++ b/v0.7.0/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -1057,7 +1057,7 @@

Source code for doctr.utils.metrics

     
   
- + diff --git a/v0.7.0/_modules/doctr/utils/visualization.html b/v0.7.0/_modules/doctr/utils/visualization.html index 38c5c5a857..3f295d2cf5 100644 --- a/v0.7.0/_modules/doctr/utils/visualization.html +++ b/v0.7.0/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -807,7 +807,7 @@

Source code for doctr.utils.visualization

     
   
- + diff --git a/v0.7.0/_modules/index.html b/v0.7.0/_modules/index.html index a55cff678b..8d79322bb9 100644 --- a/v0.7.0/_modules/index.html +++ b/v0.7.0/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -369,7 +369,7 @@

All modules for which code is available

- + diff --git a/v0.7.0/_sources/datasets.rst.txt b/v0.7.0/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.7.0/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.7.0/_sources/documents.rst.txt b/v0.7.0/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.7.0/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. 
automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.7.0/_sources/installing.rst.txt b/v0.7.0/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.7.0/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.7.0/_sources/models.rst.txt b/v0.7.0/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.7.0/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. 
autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.7.0/_sources/transforms.rst.txt b/v0.7.0/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.7.0/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.7.0/_sources/utils.rst.txt b/v0.7.0/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.7.0/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. 
autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.7.0/_static/basic.css b/v0.7.0/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.7.0/_static/basic.css +++ b/v0.7.0/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.7.0/_static/doctools.js b/v0.7.0/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.7.0/_static/doctools.js +++ b/v0.7.0/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.7.0/_static/language_data.js b/v0.7.0/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.7.0/_static/language_data.js +++ b/v0.7.0/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.7.0/_static/searchtools.js b/v0.7.0/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.7.0/_static/searchtools.js +++ b/v0.7.0/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. 
+class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.7.0/changelog.html b/v0.7.0/changelog.html index 84b977cf6b..f68335ccd2 100644 --- a/v0.7.0/changelog.html +++ b/v0.7.0/changelog.html @@ -14,7 +14,7 @@ - + Changelog - docTR documentation @@ -415,7 +415,7 @@

v0.1.0 (2021-03-05) - + diff --git a/v0.7.0/community/resources.html b/v0.7.0/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.7.0/community/resources.html +++ b/v0.7.0/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.7.0/contributing/code_of_conduct.html b/v0.7.0/contributing/code_of_conduct.html index c138b6694b..46e37028be 100644 --- a/v0.7.0/contributing/code_of_conduct.html +++ b/v0.7.0/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -498,7 +498,7 @@

Attribution - + diff --git a/v0.7.0/contributing/contributing.html b/v0.7.0/contributing/contributing.html index fa79ab0bf9..5e77704cd2 100644 --- a/v0.7.0/contributing/contributing.html +++ b/v0.7.0/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -475,7 +475,7 @@

Let’s connect - + diff --git a/v0.7.0/datasets.html b/v0.7.0/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.7.0/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
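For consistency with the other dataset examples above, here is a minimal sketch of how an OCRDataset could be instantiated; the image folder and label file paths are hypothetical placeholders:
>>> from doctr.datasets import OCRDataset
>>> # Hypothetical paths, for illustration only
>>> train_set = OCRDataset(img_folder="path/to/images", label_file="path/to/labels.json")
>>> img, target = train_set[0]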
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before being passed to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name

size

characters

digits

10

0123456789

ascii_letters

52

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

punctuation

32

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

currency

5

£€¥¢฿

latin

96

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

french

154

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
- -
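As a minimal usage sketch of the function above, with illustrative values only (a digits-only vocab and a target length of 8):
>>> from doctr.datasets import encode_sequences
>>> # Illustrative values: map two character sequences onto indices of a small vocab,
>>> # truncated/padded to a fixed length of 8
>>> encoded = encode_sequences(sequences=["123", "45"], vocab="0123456789", target_size=8)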
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/documents.html b/v0.7.0/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.7.0/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

  • -
  • size (the page's)

  • -
-
-
-
- -
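For illustration, a Word can be constructed directly from the arguments documented above; the values below are made up:
>>> from doctr.documents import Word
>>> # Illustrative values: relative coordinates in ((xmin, ymin), (xmax, ymax)) format
>>> word = Word(value="docTR", confidence=0.99, geometry=((0.10, 0.10), (0.25, 0.14)))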
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, words sitting at the same height but in different columns are considered to belong to two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and the confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
-
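Putting these elements together, here is a minimal sketch of how the hierarchy of an existing Document could be traversed (doc is assumed to be a Document instance, e.g. the output of an OCR predictor from doctr.models):
>>> # `doc` is assumed to be a Document instance
>>> for page in doc.pages:
...     for block in page.blocks:
...         for line in block.lines:
...             print(" ".join(word.value for word in line.words))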

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF file, returned as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/genindex.html b/v0.7.0/genindex.html index 488b287a3c..fe751c2c46 100644 --- a/v0.7.0/genindex.html +++ b/v0.7.0/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -734,7 +734,7 @@

W

- + diff --git a/v0.7.0/getting_started/installing.html b/v0.7.0/getting_started/installing.html index 9cefe0dd45..2401ccb3e3 100644 --- a/v0.7.0/getting_started/installing.html +++ b/v0.7.0/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -422,7 +422,7 @@

Via Git - + diff --git a/v0.7.0/index.html b/v0.7.0/index.html index 6f6ba7496b..ff6712b40d 100644 --- a/v0.7.0/index.html +++ b/v0.7.0/index.html @@ -14,7 +14,7 @@ - + docTR documentation @@ -435,7 +435,7 @@

Supported datasets - + diff --git a/v0.7.0/installing.html b/v0.7.0/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.7.0/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- -
- -
- -
-
-
-

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running another OS than Linux, you will need a few extra dependencies.

-

For macOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the last stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/models.html b/v0.7.0/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.7.0/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- -
- -
- -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model implemented with a TensorFlow backend, along with its specific post-processor that makes outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Input shape

# params

Recall

Precision

Recall

Precision

FPS

db_resnet50

(1024, 1024, 3)

25.2 M

82.14

87.64

92.49

89.66

2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model’s capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model, feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up, and then measure the average speed of the model over 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). We used a c5.12xlarge AWS instance (Intel Xeon Platinum 8275L CPU) to perform the experiments.

-
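For reference, a minimal sketch of that measurement protocol could look as follows (the loop counts simply mirror the description above; this is not the exact benchmarking script used):
>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> for _ in range(100):   # warm-up on random tensors
...     _ = model(tf.random.uniform([1, 1024, 1024, 3]), training=False)
>>> start = time.time()
>>> for _ in range(1000):  # timed batches of a single frame
...     _ = model(tf.random.uniform([1, 1024, 1024, 3]), training=False)
>>> fps = 1000 / (time.time() - start)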
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following (a minimal sketch is given after the list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
-
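As a rough illustration only (the target size and the normalization statistics below are placeholder assumptions, not the actual training values), these steps can be reproduced with plain TensorFlow ops:
>>> import tensorflow as tf
>>> imgs = [tf.random.uniform((600, 800, 3)), tf.random.uniform((720, 960, 3))]
>>> resized = [tf.image.resize(img, (1024, 1024), method="bilinear") for img in imgs]  # 1. resize (may deform)
>>> batch = tf.stack(resized, axis=0)                                                  # 2. batch images together
>>> mean, std = tf.constant([0.5, 0.5, 0.5]), tf.constant([0.25, 0.25, 0.25])          # placeholder statistics
>>> batch = (batch - mean) / std                                                       # 3. normalize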

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Text recognition model zoo

Architecture

Input shape

# params

FUNSD

CORD

FPS

crnn_vgg16_bn

(32, 128, 3)

15.8M

86.02

91.3

12.8

sar_vgg16_bn

(32, 128, 3)

21.5M

86.2

91.7

3.3

sar_resnet31

(32, 128, 3)

53.1M

86.3

92.1

2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30,595 word-level crops, which might not be representative enough of the model’s capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model, feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up, and then measure the average speed of the model over 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). We used a c5.12xlarge AWS instance (Intel Xeon Platinum 8275L CPU) to perform the experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following (a minimal sketch is given after the list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
-
-
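Again as a rough illustration (the target size and statistics are placeholder assumptions), the aspect-preserving resize and zero padding can be sketched with tf.image.resize_with_pad:
>>> import tensorflow as tf
>>> crops = [tf.random.uniform((28, 150, 3)), tf.random.uniform((40, 90, 3))]
>>> padded = [tf.image.resize_with_pad(c, 32, 128) for c in crops]  # 1-2. resize without deformation, then pad
>>> batch = tf.stack(padded, axis=0)                                # 3. batch images together
>>> batch = (batch - 0.5) / 0.25                                    # 4. normalize (placeholder statistics)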

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
-

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import crnn_vgg16_bn
->>> model = crnn_vgg16_bn(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a VGG16 feature extractor as described in “Show, Attend and Read:A Simple and Strong -Baseline for Irregular Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import sar_vgg16_bn
->>> model = sar_vgg16_bn(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a resnet-31 feature extractor as described in “Show, Attend and Read:A Simple and Strong -Baseline for Irregular Text Recognition”.

-

Example

-
>>> import tensorflow as tf
->>> from doctr.models import sar_resnet31
->>> model = sar_resnet31(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
-

MASTER as described in the paper available at https://arxiv.org/pdf/1910.02562.pdf. Example:

-
>>> import tensorflow as tf
->>> from doctr.models import master
->>> model = master(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-

Recognition predictors

-

Combining the right components around a given architecture for easier usage.

-
-
-doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
-

Text recognition architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import recognition_predictor
->>> model = recognition_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • -
  • pretrained – If True, returns a model pre-trained on our text recognition dataset

  • -
-
-
Returns:
-

Recognition predictor

-
-
-
- -
-
-
-

End-to-End OCR

-

Predictors that localize and identify text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Recall

Precision

FPS

Recall

Precision

FPS

db_resnet50 + crnn_vgg16_bn

70.08

74.77

0.85

82.19

79.67

1.6

db_resnet50 + sar_vgg16_bn

N/A

N/A

0.49

N/A

N/A

1.0

db_resnet50 + sar_resnet31

N/A

N/A

0.27

N/A

N/A

0.83

Gvision text detection

59.50

62.50

75.30

70.00

Gvision doc. text detection

64.00

53.30

68.90

61.10

AWS textract

78.10

83.00

87.50

66.00

-
-

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All recognition models of predictors are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model’s capabilities

-

FPS (Frames per second) is computed this way: we instantiate the predictor, warm up the model, and then measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. We used a c5.12xlarge AWS instance (Intel Xeon Platinum 8275L CPU) to perform the experiments.

-

Results on private ocr datasets

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Receipts

Invoices

IDs

Architecture

Recall

Precision

Recall

Precision

Recall

Precision

db_resnet50 + crnn_vgg16_bn (ours)

78.90

81.01

65.68

69.86

49.48

50.46

Gvision doc. text detection

68.91

59.89

63.20

52.85

43.70

29.21

AWS textract

75.77

77.70

70.47

69.13

46.39

43.32

-
-
-

Two-stage approaches

-

These architectures involve one stage of text detection and one stage of text recognition. The text detection output is used to produce cropped images that are then passed to the text recognition block.

-
-
-doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]
-

End-to-end OCR architecture using one model for localization, and another for text recognition.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import ocr_predictor
->>> model = ocr_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

  • -
  • pretrained – If True, returns a model pre-trained on our OCR dataset

  • -
-
-
Returns:
-

OCR predictor

-
-
-
- -
-
-
-

Model export

-

Utility functions to make the most of document analysis models.

-
-

Model compression

-
-
-doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
-

Converts a model to TFLite format

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_tflite, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_tflite(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the serialized TFLite model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
-

Converts a model to half precision

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_fp16, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_fp16(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the serialized FP16 model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
-

Quantize a TensorFlow model

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import quantize_model, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = quantize_model(model, (224, 224, 3))
-
-
-
-
-
-
Parameters:
-
    -
  • tf_model – a keras model

  • -
  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

  • -
-
-
Returns:
-

the serialized quantized model

-
-
Return type:
-

bytes

-
-
-
- -
-
-

Using SavedModel

-

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

-
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> _ = model(input_t, training=False)
->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
-
-
-

And loaded just as easily:

-
>>> import tensorflow as tf
->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
-
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/modules/contrib.html b/v0.7.0/modules/contrib.html index e99f6b3f74..7fb86b8b38 100644 --- a/v0.7.0/modules/contrib.html +++ b/v0.7.0/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -380,7 +380,7 @@

Supported contribution modules - + diff --git a/v0.7.0/modules/datasets.html b/v0.7.0/modules/datasets.html index 54975cb877..1bf096233d 100644 --- a/v0.7.0/modules/datasets.html +++ b/v0.7.0/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -997,7 +997,7 @@

Dataloader - + diff --git a/v0.7.0/modules/io.html b/v0.7.0/modules/io.html index c44084b35a..4da4f1e597 100644 --- a/v0.7.0/modules/io.html +++ b/v0.7.0/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -726,7 +726,7 @@

File reading - + diff --git a/v0.7.0/modules/models.html b/v0.7.0/modules/models.html index b2cdfc1f6e..0a479d67cb 100644 --- a/v0.7.0/modules/models.html +++ b/v0.7.0/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1222,7 +1222,7 @@

doctr.models.factory - + diff --git a/v0.7.0/modules/transforms.html b/v0.7.0/modules/transforms.html index a81fa8cb7d..485046a896 100644 --- a/v0.7.0/modules/transforms.html +++ b/v0.7.0/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -803,7 +803,7 @@

Composing transformations - + diff --git a/v0.7.0/modules/utils.html b/v0.7.0/modules/utils.html index e8529c4f80..bce9a29e76 100644 --- a/v0.7.0/modules/utils.html +++ b/v0.7.0/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -733,7 +733,7 @@

Visualization - + diff --git a/v0.7.0/notebooks.html b/v0.7.0/notebooks.html index f484efbf80..8b0a78272b 100644 --- a/v0.7.0/notebooks.html +++ b/v0.7.0/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -379,7 +379,7 @@

docTR Notebooks - + diff --git a/v0.7.0/py-modindex.html b/v0.7.0/py-modindex.html deleted file mode 100644 index c1569be607..0000000000 --- a/v0.7.0/py-modindex.html +++ /dev/null @@ -1,330 +0,0 @@ - - - - - - - - - - - Python Module Index - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
-
- -
- -
-
- -
-

Python Module Index

- -
- - - - - - - - - - - -
 
d
- doctr -
- -
-
-
- - -
-
-
- -
-
- -
-
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.7.0/search.html b/v0.7.0/search.html index 975e2efd79..5243968edf 100644 --- a/v0.7.0/search.html +++ b/v0.7.0/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -334,7 +334,7 @@ - + diff --git a/v0.7.0/searchindex.js b/v0.7.0/searchindex.js index efd2407926..1e43f55781 100644 --- a/v0.7.0/searchindex.js +++ b/v0.7.0/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface Hub": [[13, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, 
"supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, "use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id5"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator 
(class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet18_rotation() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18_rotation", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], 
"linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", 
false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, 
"", "SynthText"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet18_rotation"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 
9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": 16, "02562": 7, "03": 16, "035": [], "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 9, 11, 16], "104": [], "106": [], "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": [], "115": [], "1156": 14, "116": 5, "118": [], "11800h": 16, "11th": 16, "12": [3, 16], "120": [], "123": 5, "126": 5, "1268": [], "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, "1396484375": 16, "14": 16, "1420": 16, "14470v1": [], "149": 14, "15": 16, "150": [9, 16], "154": [], "1552": 16, "16": [7, 15], "160": [], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": [7, 16], "185546875": 16, "19": [], "1900": 16, "1910": 7, "19342": 14, "19370": 14, "195": [], "19598": [], "199": 16, "1999": 16, "1m": [], "2": [3, 4, 5, 6, 8, 16], "20": [], "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2021": [], "2023": [], "207901": 14, "21": 16, "2103": [], "2186": 14, "21888": 14, "22": 16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": [], "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": [7, 16], "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": 8, "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": 16, "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": [], "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": [], "48": [5, 16], "485": 8, "49": [], "49377": [], "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "533": [], "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "595": [], "597": 16, "5k": [4, 5], "5m": [], "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "611": [], "62": 16, "625": [], "626": 14, "629": [], "63": 16, "630": [], "64": [7, 8, 16], "640": [], "641": 16, "647": 14, "65": 16, "66": 16, "660": [], "664": [], "666": [], "67": 16, "672": [], "68": 16, "689": [], "69": 16, "693": 11, "694": 11, "695": 11, "6m": [], "7": 16, "70": [9, 16], "700": [], "701": [], "702": [], "707470": 14, "71": 16, "7100000": 14, "713": [], "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "733": [], "74": 16, "745": [], "75": [8, 16], "753": [], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "780": [], "781": [], "783": [], "785": 11, "789": [], "79": 16, "793533": 14, "796": 14, "798": 11, "7m": [], "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "817": [], "82": 16, "8275l": [], "83": 16, "830": [], "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "860": [], "8603515625": 16, "862": [], "863": [], "87": 16, "8707": 14, "875": 
[], "88": 16, "89": 16, "8m": [], "9": 16, "90": 16, "90k": 5, "90kdict32px": 5, "91": 16, "913": [], "914085328578949": 16, "917": [], "92": 16, "921": [], "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": 16, "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "And": [], "As": 2, "Be": [], "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 16], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": [], "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": [], "_build": 2, "_i": 9, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, "abstract": [], "abstractdataset": 5, "abus": 1, "accent": [], "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": [], "accuraci": 9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13], "add_hook": [], "add_label": 9, "addit": [2, 3, 6], "addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": [], "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, "along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "andrej": [], "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": [], "arch": [7, 13], "architectur": [4, 7, 13], "archiv": [], "area": 16, "arg": [5, 7], "argument": [5, 7, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "articl": [], "artifici": [4, 5], "arxiv": 7, "as_imag": [], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autoclass": [], "autom": 4, "automat": [], "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": [], "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "baranovskij": [], "base": [4, 7], "baselin": [4, 7, 16], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": [], "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": 1, "being": [9, 16], "belong": 16, "benchmark": 16, "best": 1, "beta": [], "better": [10, 16], "between": [8, 9], "bgr": 6, "bilinear": 8, "bin_thresh": [], "binar": [4, 7], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": [], "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [6, 9], "c5": [], "c_j": 9, "cach": [2, 
5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], "capabl": [2, 10, 16], "case": [5, 9], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "changelog": [], "channel": [1, 2, 6, 8], "channel_prior": [], "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": [5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, "check": [2, 13, 16], "checkpoint": 7, "chip": 3, "christian": [], "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 5, "compil": [10, 16], "complaint": 1, "complementari": 9, "complet": 2, "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], "config": 7, "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "conv_sequ": [], "convers": 6, "convert": [6, 8], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 7, "cool": [], "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_resnet31": [], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": 13, "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": 4, "czczup": [], "czech": 5, "d": [5, 14], "daili": [], "danish": [], "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": 16, "db_sar_resnet": [], "db_sar_vgg": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": [], "default": [6, 9, 11, 12], "defer": 14, "defin": [9, 15], "deform": [], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": 11, "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, 
"detectionpredictor": [7, 11], "detector": [], "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "developp": [], "deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, "discuss": 2, "disk": [], "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distanc": [], "distribut": 8, "div": 16, "divers": 1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": [], "documentfil": [6, 13], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": [], "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easier": [], "easili": [6, 9, 11, 13, 14, 16], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enough": [2, 16], "ensur": 2, "entir": [], "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "error": [], "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exactmatch": [], "exampl": [1, 2, 4, 5, 7, 13], "exchang": 15, "exclud": [], "execut": [], "exist": 13, "expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extract_arch": [], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": [], "fast": [4, 5, 7], "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": 15, "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feed": [], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15], "figsiz": 9, "figur": 9, "file": [2, 5], "file_hash": [], "file_nam": [], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": [], "first": 2, "firsthand": 5, "fit": [7, 16], "fitz": [], "flag": 16, "flexibl": [], "flip": 8, "float": [6, 8, 9, 15], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": [], "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "found": [], "fp": [], "fp16": 15, "frac": 9, "frame": [], "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "fulli": [], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, 
"gallagh": [], "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11], "get": 16, "get_artefact": [], "get_word": [], "gettextword": [], "git": 13, "github": [2, 3, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, "googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": 6, "grayscal": 8, "ground": 9, "groung": 9, "group": 4, "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, "gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "half": [], "handl": 14, "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": [], "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": [], "horizont": [6, 8], "hous": 5, "how": [2, 11, 13, 14], "howev": 14, "hsv": 8, "html": [1, 2, 16], "http": [1, 3, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ibrahimov": [], "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 14], "img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": [], "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inform": [1, 2, 4, 5, 14], "inherit": [], "input": [2, 6, 7, 8, 15, 16], "input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_t": [], "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "invoic": [], "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "issu": [1, 2, 13], "italian": [], "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "jame": [], "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "kei": [], "kera": [7, 15], "kernel": 8, "kernel_s": [], "kernel_shap": 8, "keywoard": [], "keyword": [5, 7], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": [1, 16], "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], 
"legacy_french": 5, "length": 5, "less": 15, "let": [], "letter": [], "level": [1, 5, 9, 16], "levenshtein": [], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet16": [], "linknet_resnet18": [7, 11, 16], "linknet_resnet18_rot": [7, 16], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": 3, "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loader": [], "loc_pr": [], "local": [2, 4, 5, 7, 9, 14, 16], "localis": 5, "localizationconfus": 9, "locat": [2, 6], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "love": 13, "lower": [8, 9], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": [14, 16], "manipul": [], "map": 5, "map_loc": 11, "mask_shap": 9, "master": [4, 7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": 9, "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_dist": [], "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": [], "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [8, 16], "metric": [9, 16], "middl": [], "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": [], "minimum": [3, 5, 8, 9], "minval": 8, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "modal": [], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12], "modul": [6, 8, 9, 16], "moment": 16, "more": [2, 9, 14, 16], "moscardi": [], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [], "multipl": [5, 6, 8], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": [], "n": [5, 9], "na": [], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": 16, "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13], "neg": 8, "nest": 16, "nestedobject": [], "netraj": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "newer": [], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": [], "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": 1, 
"offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "opinion": 1, "optic": [4, 16], "optim": 4, "option": [5, 11], "order": [2, 5, 6, 8], "org": [1, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": [], "paragraph_break": [], "parallel": [], "param": [8, 16], "paramet": [4, 5, 6, 7, 8, 9, 15], "pars": [4, 5], "parseq": [4, 7, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": 7, "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "patil": [], "pattern": 1, "pdf": [6, 7, 10], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": [], "pixbuf": 3, "pixel": [6, 8, 16], "platinum": [], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, "polish": [], "polit": 1, "polygon": [5, 16], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 13], "post": [1, 16], "postprocessor": [], "potenti": 7, "power": 4, "ppageno": 16, "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, "preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "properti": [], "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pypdfium2": 6, "pyplot": 9, "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": [], "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": 6, "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "realli": [], "reason": 1, 
"rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, "recogn": 16, "recognit": [5, 9, 11], "recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "recurr": [], "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": [], "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "relu": [], "remov": 1, "render": 6, "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], "repres": [1, 9, 15, 16], "represent": [4, 7], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": [], "resolve_lin": [], "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "return": [5, 6, 7, 9, 16], "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "roboflow": [], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sanjin": [], "sar": [4, 7], "sar_resnet31": [7, 16], "sar_vgg16_bn": [], "satur": 8, "save": [7, 14], "saved_model": [], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seemlessli": [], "seen": 16, "segment": [4, 7, 16], "self": [], "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": 8, "seri": 1, "serial": [], "serialized_model": [], "seriou": 1, "set": [1, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": 1, "span": 16, "spanish": 5, "spatial": [6, 9], "special": [], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": [], "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": [], "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": [], "symbol": [], "symmetr": [7, 8, 16], 
"symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "task": [4, 5, 7, 13, 14, 16], "task2": 5, "team": [], "techminde": [], "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": 14, "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": [], "textmatch": 9, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "tf_model": [], "tflite": [], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": [], "through": [1, 8, 14], "tilman": [], "time": [1, 4, 7, 9, 14], "tini": [], "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "turn": [], "two": [6, 12], "txt": 5, "type": [6, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": 14, "underneath": 6, "understand": [4, 5, 16], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unit": [], "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v0": [], "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "veri": 7, "verifi": [], "verma": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn_r": 7, "via": 1, "video": [], "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warm": [], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": [], "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, 
"worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x12larg": [], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xeon": [], "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "yugesh": [], "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], 
"\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": [], "2024": [], "21": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": [], "8": [], "9": [], "advanc": [], "approach": 16, "architectur": 16, "arg": [], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [14, 16], "classif": [7, 13], 
"code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "compress": [], "conda": [], "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "from": 13, "gener": [5, 14], "get": [], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "implement": [], "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "linux": [], "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "note": [], "notebook": 10, "object": 14, "ocr": 16, "onli": [], "onnx": 15, "optim": 15, "option": [], "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": [], "pre": [], "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "process": [], "push": 13, "python": 3, "qualiti": 2, "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "refer": [], "report": 2, "request": 2, "resourc": [], "respons": 1, "return": [], "right": 16, "savedmodel": [], "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "start": [], "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. 
Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface Hub": [[13, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, 
"use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id5"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, 
"doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet18_rotation() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18_rotation", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, 
"doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, 
"doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], 
[6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet18_rotation"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": 16, "02562": 7, "03": 16, "035": [], "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 
9, 11, 16], "104": [], "106": [], "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": [], "115": [], "1156": 14, "116": 5, "118": [], "11800h": 16, "11th": 16, "12": [3, 16], "120": [], "123": 5, "126": 5, "1268": [], "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, "1396484375": 16, "14": 16, "1420": 16, "14470v1": [], "149": 14, "15": 16, "150": [9, 16], "1552": 16, "16": [7, 15], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": [7, 16], "185546875": 16, "1900": 16, "1910": 7, "19342": 14, "19370": 14, "195": [], "19598": [], "199": 16, "1999": 16, "2": [3, 4, 5, 6, 8, 16], "20": [], "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2023": [], "207901": 14, "21": 16, "2103": [], "2186": 14, "21888": 14, "22": 16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": [], "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": [7, 16], "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": 8, "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": 16, "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": [], "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": [], "48": [5, 16], "485": 8, "49": [], "49377": [], "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "597": 16, "5k": [4, 5], "5m": [], "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "62": 16, "626": 14, "63": 16, "64": [7, 8, 16], "641": 16, "647": 14, "65": 16, "66": 16, "67": 16, "68": 16, "69": 16, "693": 11, "694": 11, "695": 11, "6m": [], "7": 16, "70": [9, 16], "707470": 14, "71": 16, "7100000": 14, "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "74": 16, "75": [8, 16], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "785": 11, "79": 16, "793533": 14, "796": 14, "798": 11, "7m": [], "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "82": 16, "83": 16, "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "8603515625": 16, "87": 16, "8707": 14, "88": 16, "89": 16, "9": 16, "90": 16, "90k": 5, "90kdict32px": 5, "91": 16, "914085328578949": 16, "92": 16, "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": 16, "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "As": 2, "Be": [], "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 16], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": [], "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": [], "_build": 2, "_i": 9, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, "abstract": [], "abstractdataset": 5, "abus": 1, "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": [], "accuraci": 
9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13], "add_hook": [], "add_label": 9, "addit": [2, 3, 6], "addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": [], "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, "along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "andrej": [], "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": [], "arch": [7, 13], "architectur": [4, 7, 13], "area": 16, "arg": [5, 7], "argument": [5, 7, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "articl": [], "artifici": [4, 5], "arxiv": 7, "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autom": 4, "automat": [], "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": [], "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "baranovskij": [], "base": [4, 7], "baselin": [4, 7, 16], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": [], "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": 1, "being": [9, 16], "belong": 16, "benchmark": 16, "best": 1, "better": [10, 16], "between": [8, 9], "bgr": 6, "bilinear": 8, "bin_thresh": [], "binar": [4, 7], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": [], "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [6, 9], "c_j": 9, "cach": [2, 5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], "capabl": [2, 10, 16], "case": [5, 9], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "channel": [1, 2, 6, 8], "channel_prior": [], "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": [5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, "check": [2, 13, 16], "checkpoint": 7, "chip": 3, "christian": [], "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": 
[1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 5, "compil": [10, 16], "complaint": 1, "complementari": 9, "complet": 2, "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], "config": 7, "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "convers": 6, "convert": [6, 8], "convolut": 7, "cool": [], "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": 13, "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": 4, "czczup": [], "czech": 5, "d": [5, 14], "danish": [], "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": 16, "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": [], "default": [6, 9, 11, 12], "defer": 14, "defin": [9, 15], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": 11, "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, "detectionpredictor": [7, 11], "detector": [], "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, "discuss": 2, "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distribut": 8, "div": 16, "divers": 1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": [], "documentfil": [6, 13], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": [], "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easili": [6, 9, 11, 13, 14, 16], "econom": 1, "edit": 1, "educ": 1, "effect": [], 
"effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enough": [2, 16], "ensur": 2, "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exampl": [1, 2, 4, 5, 7, 13], "exchang": 15, "execut": [], "exist": 13, "expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": [], "fast": [4, 5, 7], "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": 15, "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15], "figsiz": 9, "figur": 9, "file": [2, 5], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": [], "first": 2, "firsthand": 5, "fit": [7, 16], "flag": 16, "flip": 8, "float": [6, 8, 9, 15], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": [], "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "found": [], "fp16": 15, "frac": 9, "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gallagh": [], "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11], "get": 16, "git": 13, "github": [2, 3, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, "googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": 6, "grayscal": 8, "ground": 9, "groung": 9, "group": 4, "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, "gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "handl": 14, "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": [], "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": [], "horizont": [6, 8], "hous": 5, "how": [2, 11, 13, 14], "howev": 14, "hsv": 8, "html": [1, 2, 16], "http": [1, 3, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ibrahimov": [], "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 
14], "img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": [], "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inform": [1, 2, 4, 5, 14], "input": [2, 6, 7, 8, 15, 16], "input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "issu": [1, 2, 13], "italian": [], "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "jame": [], "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "kei": [], "kera": [7, 15], "kernel": 8, "kernel_shap": 8, "keywoard": [], "keyword": [5, 7], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": [1, 16], "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], "legacy_french": 5, "length": 5, "less": 15, "level": [1, 5, 9, 16], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet_resnet18": [7, 11, 16], "linknet_resnet18_rot": [7, 16], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": 3, "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loc_pr": [], "local": [2, 4, 5, 7, 9, 14, 16], "localis": 5, "localizationconfus": 9, "locat": [2, 6], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "love": 13, "lower": [8, 9], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": [14, 16], "manipul": [], "map": 5, "map_loc": 11, "mask_shap": 9, "master": [4, 7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": 9, "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": [], "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [8, 16], "metric": [9, 16], "middl": [], "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 
4], "minimalist": [], "minimum": [3, 5, 8, 9], "minval": 8, "miss": 3, "mistak": 1, "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "modal": [], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12], "modul": [6, 8, 9, 16], "moment": 16, "more": [2, 9, 14, 16], "moscardi": [], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [], "multipl": [5, 6, 8], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": [], "n": [5, 9], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": 16, "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13], "neg": 8, "nest": 16, "netraj": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": [], "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": 1, "offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "opinion": 1, "optic": [4, 16], "optim": 4, "option": [5, 11], "order": [2, 5, 6, 8], "org": [1, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": [], "paragraph_break": [], "parallel": [], "param": [8, 16], "paramet": [4, 5, 6, 7, 8, 9, 15], "pars": [4, 5], "parseq": [4, 7, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": 7, "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "patil": [], "pattern": 1, "pdf": [6, 7, 10], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": [], "pixbuf": 3, "pixel": [6, 8, 16], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, "polish": [], "polit": 1, "polygon": [5, 16], "pool": 7, "portugues": 5, "posit": 
[1, 9], "possibl": [2, 9, 13], "post": [1, 16], "postprocessor": [], "potenti": 7, "power": 4, "ppageno": 16, "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, "preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pypdfium2": 6, "pyplot": 9, "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": [], "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": 6, "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "realli": [], "reason": 1, "rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, "recogn": 16, "recognit": [5, 9, 11], "recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": [], "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "remov": 1, "render": 6, "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], "repres": [1, 9, 15, 16], "represent": [4, 7], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": [], "resolve_lin": [], "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "return": [5, 6, 7, 9, 16], "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "roboflow": [], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sanjin": [], "sar": [4, 7], "sar_resnet31": [7, 16], "satur": 8, "save": [7, 14], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "score": 9, "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seen": 16, "segment": [4, 7, 16], "self": [], "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": 8, "seri": 1, "seriou": 1, "set": [1, 5, 7, 
9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "shade": 8, "shape": [6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": 1, "span": 16, "spanish": 5, "spatial": [6, 9], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": [], "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": [], "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": [], "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "task": [4, 5, 7, 13, 14, 16], "task2": 5, "team": [], "techminde": [], "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": 14, "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": [], "textmatch": 9, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": [], "through": [1, 8, 14], "tilman": [], "time": [1, 4, 7, 9, 14], "tini": [], "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "two": [6, 12], "txt": 5, "type": [6, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": 14, "underneath": 6, "understand": [4, 5, 16], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unit": [], "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], 
"use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "veri": 7, "verma": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn_r": 7, "via": 1, "video": [], "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": [], "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "yugesh": [], "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, 
"\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], 
"\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": [], "2024": [], "21": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": [], "8": [], "9": [], "advanc": [], "approach": 16, "architectur": 16, "arg": [], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "ban": 1, "block": 6, "bug": 2, "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "conda": [], "conduct": 1, "connect": 2, "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "from": 13, "gener": [5, 14], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "linux": [], "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "notebook": 10, "object": 14, "ocr": 16, "onli": [], "onnx": 15, "optim": 15, "option": [], "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "push": 13, "python": 3, "qualiti": 2, "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "report": 2, "request": 2, "resourc": [], "respons": 1, "return": [], "right": 16, "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file diff --git a/v0.7.0/transforms.html b/v0.7.0/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.7.0/transforms.html +++ /dev/null @@ -1,684 
+0,0 @@ - doctr.transforms - docTR documentation

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.
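As a minimal sketch (assuming a TensorFlow backend, as in the examples below), two of the modules documented on this page can be chained on a dummy image tensor; the mean/std values are the ImageNet statistics used elsewhere in these docs:

>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, Normalize
>>> # resize to 32x32, then normalize each channel
>>> transfo = Compose([Resize((32, 32)), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))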

-
-

Supported transformations

-

Here are all transformations that are available through DocTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
- -
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor to a Gaussian distribution for each channel

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a user-defined transformation function to a tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.
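For instance, a small augmentation pipeline can be sketched (assuming a TensorFlow backend) by nesting the helpers documented below; the probability and the picked transformations are illustrative only:

>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, RandomApply, OneOf, RandomGamma, RandomJpegQuality
>>> # always resize, then with probability 0.5 apply either a gamma or a JPEG-quality perturbation
>>> pipeline = Compose([Resize((32, 32)), RandomApply(OneOf([RandomGamma(), RandomJpegQuality()]), p=0.5)])
>>> out = pipeline(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))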

-
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomJpegQuality, RandomGamma
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, one only will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Apply the input transformation with probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/using_doctr/custom_models_training.html b/v0.7.0/using_doctr/custom_models_training.html index 7cb776b28f..6273492fc9 100644 --- a/v0.7.0/using_doctr/custom_models_training.html +++ b/v0.7.0/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -545,7 +545,7 @@

Loading your custom trained model - + diff --git a/v0.7.0/using_doctr/running_on_aws.html b/v0.7.0/using_doctr/running_on_aws.html index 207347602a..a52e3bb5f7 100644 --- a/v0.7.0/using_doctr/running_on_aws.html +++ b/v0.7.0/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -356,7 +356,7 @@

AWS Lambda - + diff --git a/v0.7.0/using_doctr/sharing_models.html b/v0.7.0/using_doctr/sharing_models.html index 45a53f9c5c..6a5ca6611e 100644 --- a/v0.7.0/using_doctr/sharing_models.html +++ b/v0.7.0/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -528,7 +528,7 @@

Recognition - + diff --git a/v0.7.0/using_doctr/using_contrib_modules.html b/v0.7.0/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.7.0/using_doctr/using_contrib_modules.html +++ b/v0.7.0/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.7.0/using_doctr/using_datasets.html b/v0.7.0/using_doctr/using_datasets.html index 594b518886..2c62da97af 100644 --- a/v0.7.0/using_doctr/using_datasets.html +++ b/v0.7.0/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -613,7 +613,7 @@

Data Loading - + diff --git a/v0.7.0/using_doctr/using_model_export.html b/v0.7.0/using_doctr/using_model_export.html index 0129f7c861..5c5cbe84ce 100644 --- a/v0.7.0/using_doctr/using_model_export.html +++ b/v0.7.0/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -436,7 +436,7 @@

Using your ONNX exported model in docTR - + diff --git a/v0.7.0/using_doctr/using_models.html b/v0.7.0/using_doctr/using_models.html index 261b6bab62..4e8dd95a8f 100644 --- a/v0.7.0/using_doctr/using_models.html +++ b/v0.7.0/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1112,7 +1112,7 @@

What should I do with the output? - + diff --git a/v0.7.0/utils.html b/v0.7.0/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.7.0/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - doctr.utils - docTR documentation

doctr.utils

-

This module regroups non-core features that are complementary to the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest windows side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model's performance.

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements text match metric (word-level accuracy) for recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, \quad TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, \quad f_a(x) = \left\{ \begin{array}{ll} 1 & \mbox{if } x = a \\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences and \(N\) is a strictly positive integer.
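As a worked instance of this formula with \(N = 2\): comparing ['Hello', 'world'] against ['hello', 'world'] gives an exact-match score of \(\frac{1}{2}(0 + 1) = 0.5\), since only the second pair matches character for character, while the lower-case counterpart reported by summary() is \(\frac{1}{2}(1 + 1) = 1\).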

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode counterpart and its lower-case unidecode counterpart

-
-
-
- -
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and \(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, \quad g_X(y) = \left\{ \begin{array}{ll} 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.
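For reference, a minimal numpy sketch of the pairwise IoU these formulas rely on (an illustration assuming (xmin, ymin, xmax, ymax) boxes, not the library's own implementation):

    import numpy as np

    def box_iou(a: np.ndarray, b: np.ndarray) -> np.ndarray:
        """Pairwise IoU between two sets of (xmin, ymin, xmax, ymax) boxes."""
        # Intersection rectangle for every (a_i, b_j) pair
        left = np.maximum(a[:, None, 0], b[None, :, 0])
        top = np.maximum(a[:, None, 1], b[None, :, 1])
        right = np.minimum(a[:, None, 2], b[None, :, 2])
        bottom = np.minimum(a[:, None, 3], b[None, :, 3])
        inter = np.clip(right - left, 0, None) * np.clip(bottom - top, 0, None)
        area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
        area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
        union = area_a[:, None] + area_b[None, :] - inter
        return inter / union

    # e.g. box_iou(np.array([[0, 0, 100, 100]]), np.array([[0, 0, 70, 70]])) -> [[0.49]]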

Example::
    >>> import numpy as np
    >>> from doctr.utils import LocalizationConfusion
    >>> metric = LocalizationConfusion(iou_thresh=0.5)
    >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
    >>> metric.summary()
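To make these numbers concrete (worked arithmetic, not library output): the boxes [0, 0, 100, 100] and [0, 0, 70, 70] overlap on a 70×70 area, so their IoU is 4900 / (10000 + 4900 - 4900) = 0.49, just under the default 0.5 threshold, while [110, 95, 200, 150] does not overlap [0, 0, 100, 100] at all.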
Parameters:
    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match
summary() → Tuple[float | None, float | None, float | None]

Computes the aggregated metrics.

Returns:
    a tuple with the recall, precision and meanIoU scores
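A minimal sketch of reading the result; each value can be None when nothing has been accumulated yet, hence the Optional return types:

    >>> recall, precision, mean_iou = metric.summary()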
class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))

Implements an end-to-end OCR metric.

The aggregated metrics are computed as follows:

\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N,
\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\
Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\
Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\
meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and \(y\), and the function \(h_{B, L}\) defined as:

\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L},
h_{B,L}(b, l) = \left\{
    \begin{array}{ll}
        1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\
          & IoU \geq 0.5 \mbox{ and, for this assignment, } l = L_j \\
        0 & \mbox{otherwise.}
    \end{array}
\right.\end{split}\]

where \(\mathcal{B}\) is the set of possible bounding boxes, \(\mathcal{L}\) is the set of possible character sequences, and \(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

Example::
    >>> import numpy as np
    >>> from doctr.utils import OCRMetric
    >>> metric = OCRMetric(iou_thresh=0.5)
    >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
    ...               ['hello'], ['hello', 'world'])
    >>> metric.summary()
Parameters:
    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match
summary() → Tuple[Dict[str, float | None], Dict[str, float | None], float | None]

Computes the aggregated metrics.

Returns:
    a tuple with the recall & precision for each string comparison flexibility and the mean IoU
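A minimal sketch of reading the output, continuing the example above (the two dicts are keyed by string-comparison flexibility, mirroring the variants described for TextMatch; exact key names are not assumed here):

    >>> recalls, precisions, mean_iou = metric.summary()
    >>> for flexibility, value in recalls.items():
    ...     print(flexibility, value)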
- - - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/datasets/cord.html b/v0.8.0/_modules/doctr/datasets/cord.html index 354f0062c2..85f1a47a08 100644 --- a/v0.8.0/_modules/doctr/datasets/cord.html +++ b/v0.8.0/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.datasets.cord

     
   
- +

diff --git a/v0.8.0/_modules/doctr/datasets/core.html b/v0.8.0/_modules/doctr/datasets/core.html
deleted file mode 100644
index b3dcc29ff9..0000000000
--- a/v0.8.0/_modules/doctr/datasets/core.html
+++ /dev/null
@@ -1,417 +0,0 @@
-doctr.datasets.core - docTR documentation

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-class VisionDataset(AbstractDataset):
-    """Implements an abstract dataset
-
-    Args:
-        url: URL of the dataset
-        file_name: name of the file once downloaded
-        file_hash: expected SHA256 of the file
-        extract_archive: whether the downloaded file is an archive to be extracted
-        download: whether the dataset should be downloaded if not present on disk
-        overwrite: whether the archive should be re-extracted
-    """
-
-    def __init__(
-        self,
-        url: str,
-        file_name: Optional[str] = None,
-        file_hash: Optional[str] = None,
-        extract_archive: bool = False,
-        download: bool = False,
-        overwrite: bool = False,
-    ) -> None:
-
-        dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets')
-
-        file_name = file_name if isinstance(file_name, str) else os.path.basename(url)
-        # Download the file if not present
-        archive_path = os.path.join(dataset_cache, file_name)
-
-        if not os.path.exists(archive_path) and not download:
-            raise ValueError("the dataset needs to be downloaded first with download=True")
-
-        archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets')
-
-        # Extract the archive
-        if extract_archive:
-            archive_path = Path(archive_path)
-            dataset_path = archive_path.parent.joinpath(archive_path.stem)
-            if not dataset_path.is_dir() or overwrite:
-                with ZipFile(archive_path, 'r') as f:
-                    f.extractall(path=dataset_path)
-
-        # List images
-        self._root = dataset_path if extract_archive else archive_path
-        self.data: List[Any] = []
\ No newline at end of file

diff --git a/v0.8.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.8.0/_modules/doctr/datasets/datasets/tensorflow.html
deleted file mode 100644
index a236abd9fe..0000000000
--- a/v0.8.0/_modules/doctr/datasets/datasets/tensorflow.html
+++ /dev/null
@@ -1,356 +0,0 @@
-doctr.datasets.datasets.tensorflow - docTR documentation

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-class VisionDataset(AbstractDataset, _VisionDataset):
-    pass
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/datasets/detection.html b/v0.8.0/_modules/doctr/datasets/detection.html index faf9256c89..706b89a562 100644 --- a/v0.8.0/_modules/doctr/datasets/detection.html +++ b/v0.8.0/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -424,7 +424,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/doc_artefacts.html b/v0.8.0/_modules/doctr/datasets/doc_artefacts.html index 886999868b..dc8e8f9c29 100644 --- a/v0.8.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.8.0/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -408,7 +408,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.8.0/_modules/doctr/datasets/funsd.html b/v0.8.0/_modules/doctr/datasets/funsd.html index 60f7e51592..6f7ab121f0 100644 --- a/v0.8.0/_modules/doctr/datasets/funsd.html +++ b/v0.8.0/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.funsd

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.8.0/_modules/doctr/datasets/generator/tensorflow.html index fecf8b2d82..814dc0822d 100644 --- a/v0.8.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.8.0/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -389,7 +389,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/datasets/ic03.html b/v0.8.0/_modules/doctr/datasets/ic03.html index 83f7bcddf0..cf8999d751 100644 --- a/v0.8.0/_modules/doctr/datasets/ic03.html +++ b/v0.8.0/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -452,7 +452,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/ic13.html b/v0.8.0/_modules/doctr/datasets/ic13.html index 1d92d10349..7650af381c 100644 --- a/v0.8.0/_modules/doctr/datasets/ic13.html +++ b/v0.8.0/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -425,7 +425,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/iiit5k.html b/v0.8.0/_modules/doctr/datasets/iiit5k.html index 14ab1db716..b4a54e7e22 100644 --- a/v0.8.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.8.0/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -429,7 +429,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/iiithws.html b/v0.8.0/_modules/doctr/datasets/iiithws.html index e7c0d4e8dd..052a85cd56 100644 --- a/v0.8.0/_modules/doctr/datasets/iiithws.html +++ b/v0.8.0/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -401,7 +401,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/imgur5k.html b/v0.8.0/_modules/doctr/datasets/imgur5k.html index eb12e48784..f6c1a4692c 100644 --- a/v0.8.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.8.0/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/loader.html b/v0.8.0/_modules/doctr/datasets/loader.html index cdaec1bb70..9b2b3126de 100644 --- a/v0.8.0/_modules/doctr/datasets/loader.html +++ b/v0.8.0/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -428,7 +428,7 @@

Source code for doctr.datasets.loader

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/mjsynth.html b/v0.8.0/_modules/doctr/datasets/mjsynth.html index d7a7e66e35..c95f99e6d5 100644 --- a/v0.8.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.8.0/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -432,7 +432,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/ocr.html b/v0.8.0/_modules/doctr/datasets/ocr.html index c6e09faee3..a1a249b259 100644 --- a/v0.8.0/_modules/doctr/datasets/ocr.html +++ b/v0.8.0/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -397,7 +397,7 @@

Source code for doctr.datasets.ocr

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/recognition.html b/v0.8.0/_modules/doctr/datasets/recognition.html index 1e14da06a9..95612cdadb 100644 --- a/v0.8.0/_modules/doctr/datasets/recognition.html +++ b/v0.8.0/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -382,7 +382,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/sroie.html b/v0.8.0/_modules/doctr/datasets/sroie.html index f3ac7b9547..32b4b17983 100644 --- a/v0.8.0/_modules/doctr/datasets/sroie.html +++ b/v0.8.0/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -429,7 +429,7 @@

Source code for doctr.datasets.sroie

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/svhn.html b/v0.8.0/_modules/doctr/datasets/svhn.html index f10a8cfd8e..5633dcfd6c 100644 --- a/v0.8.0/_modules/doctr/datasets/svhn.html +++ b/v0.8.0/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -457,7 +457,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/svt.html b/v0.8.0/_modules/doctr/datasets/svt.html index 0d64efedf4..0ed4482c50 100644 --- a/v0.8.0/_modules/doctr/datasets/svt.html +++ b/v0.8.0/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -443,7 +443,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/synthtext.html b/v0.8.0/_modules/doctr/datasets/synthtext.html index 333de06da8..edd5c63c80 100644 --- a/v0.8.0/_modules/doctr/datasets/synthtext.html +++ b/v0.8.0/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/utils.html b/v0.8.0/_modules/doctr/datasets/utils.html index 6e90a6400d..eeee0b2654 100644 --- a/v0.8.0/_modules/doctr/datasets/utils.html +++ b/v0.8.0/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -542,7 +542,7 @@

Source code for doctr.datasets.utils

     
   
- + diff --git a/v0.8.0/_modules/doctr/datasets/wildreceipt.html b/v0.8.0/_modules/doctr/datasets/wildreceipt.html index 2b386ae694..6b5a52a10e 100644 --- a/v0.8.0/_modules/doctr/datasets/wildreceipt.html +++ b/v0.8.0/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -437,7 +437,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- +

diff --git a/v0.8.0/_modules/doctr/documents/elements.html b/v0.8.0/_modules/doctr/documents/elements.html
deleted file mode 100644
index 10c1e142d2..0000000000
--- a/v0.8.0/_modules/doctr/documents/elements.html
+++ /dev/null
@@ -1,577 +0,0 @@
-doctr.documents.elements - docTR documentation

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
\ No newline at end of file

diff --git a/v0.8.0/_modules/doctr/documents/reader.html b/v0.8.0/_modules/doctr/documents/reader.html
deleted file mode 100644
index cdcd814b6c..0000000000
--- a/v0.8.0/_modules/doctr/documents/reader.html
+++ /dev/null
@@ -1,612 +0,0 @@
-doctr.documents.reader - docTR documentation

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/io/elements.html b/v0.8.0/_modules/doctr/io/elements.html index 78ea4cc7cf..a8d52c457f 100644 --- a/v0.8.0/_modules/doctr/io/elements.html +++ b/v0.8.0/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -960,7 +960,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.8.0/_modules/doctr/io/html.html b/v0.8.0/_modules/doctr/io/html.html index a1eb075da0..34a60da286 100644 --- a/v0.8.0/_modules/doctr/io/html.html +++ b/v0.8.0/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -354,7 +354,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.8.0/_modules/doctr/io/image/base.html b/v0.8.0/_modules/doctr/io/image/base.html index 1b42de0506..54663fa868 100644 --- a/v0.8.0/_modules/doctr/io/image/base.html +++ b/v0.8.0/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -382,7 +382,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.8.0/_modules/doctr/io/image/tensorflow.html b/v0.8.0/_modules/doctr/io/image/tensorflow.html index 02325e0630..cf030207d4 100644 --- a/v0.8.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.8.0/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -439,7 +439,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.8.0/_modules/doctr/io/pdf.html b/v0.8.0/_modules/doctr/io/pdf.html index 7d82b6573c..7dcb3e2381 100644 --- a/v0.8.0/_modules/doctr/io/pdf.html +++ b/v0.8.0/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -368,7 +368,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.8.0/_modules/doctr/io/reader.html b/v0.8.0/_modules/doctr/io/reader.html index 5a8c87d168..5568ce7e0f 100644 --- a/v0.8.0/_modules/doctr/io/reader.html +++ b/v0.8.0/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.8.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.8.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 1b97d83911..4dd332b464 100644 --- a/v0.8.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -518,7 +518,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.8.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index b583e184fa..7dbc971810 100644 --- a/v0.8.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -747,7 +747,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.8.0/_modules/doctr/models/classification/resnet/tensorflow.html index 67c7ede371..77a5747d8b 100644 --- a/v0.8.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -730,7 +730,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.8.0/_modules/doctr/models/classification/textnet/tensorflow.html index a36ebab4f6..45bcea9658 100644 --- a/v0.8.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -599,7 +599,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.8.0/_modules/doctr/models/classification/vgg/tensorflow.html index 57e34af78f..8dc381674b 100644 --- a/v0.8.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -439,7 +439,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.8.0/_modules/doctr/models/classification/vit/tensorflow.html index 717a6d1649..84d68b5388 100644 --- a/v0.8.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -521,7 +521,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/classification/zoo.html b/v0.8.0/_modules/doctr/models/classification/zoo.html index 87f2d2956d..d1f749776e 100644 --- a/v0.8.0/_modules/doctr/models/classification/zoo.html +++ b/v0.8.0/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -400,7 +400,7 @@

Source code for doctr.models.classification.zoo

- +

diff --git a/v0.8.0/_modules/doctr/models/detection/differentiable_binarization.html b/v0.8.0/_modules/doctr/models/detection/differentiable_binarization.html
deleted file mode 100644
index 38e9b36ec2..0000000000
--- a/v0.8.0/_modules/doctr/models/detection/differentiable_binarization.html
+++ /dev/null
@@ -1,879 +0,0 @@
-doctr.models.detection.differentiable_binarization - docTR documentation

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to unshrink polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize p_map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: The first parameter.
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly casted to a ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing x, y, w, h, score for the box
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove boxes that are too small
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channel to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 1, -1):
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature maps is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
-
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon threshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coord., to draw the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
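The negative shrink applied in `compute_target` follows the DB paper: each ground-truth polygon is offset inwards by a distance D = A * (1 - r^2) / L, where A is the polygon area, L its perimeter and r the shrink ratio. A minimal standalone sketch of that step, with illustrative values that are not taken from the module above:

```python
import numpy as np
import pyclipper
from shapely.geometry import Polygon

shrink_ratio = 0.4
poly = np.array([[10, 10], [110, 10], [110, 60], [10, 60]])  # a 100 x 50 box

# Offset distance from the DB paper: D = A * (1 - r^2) / L
polygon = Polygon(poly)
distance = polygon.area * (1 - shrink_ratio ** 2) / polygon.length

# Negative offset -> shrink the polygon inwards
offset = pyclipper.PyclipperOffset()
offset.AddPath([tuple(pt) for pt in poly], pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
shrunken = np.array(offset.Execute(-distance)[0]).reshape(-1, 2)

print(round(distance, 1))  # 14.0 for this box
print(shrunken.shape)      # (n_vertices, 2)
```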
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
-
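In the balanced BCE term above, every positive pixel contributes, while only the hardest negatives are kept (at most three negatives per positive) through `tf.nn.top_k`. A small illustrative sketch of that selection on a flat loss vector, using made-up values rather than real model output:

```python
import tensorflow as tf

# Per-pixel BCE losses and binary ground truth over the already-masked pixels
bce_loss = tf.constant([0.9, 0.1, 0.7, 0.2, 0.05, 0.8])
seg_target = tf.constant([1.0, 0.0, 0.0, 0.0, 0.0, 1.0])

neg_target = 1.0 - seg_target
positive_count = tf.reduce_sum(seg_target)  # 2 positives
negative_count = tf.minimum(tf.reduce_sum(neg_target), 3.0 * positive_count)

# Keep only the hardest negatives (largest losses among negative pixels)
negative_loss, _ = tf.nn.top_k(bce_loss * neg_target, tf.cast(negative_count, tf.int32))

sum_losses = tf.reduce_sum(bce_loss * seg_target) + tf.reduce_sum(negative_loss)
balanced_bce = sum_losses / (positive_count + negative_count + 1e-6)
print(float(balanced_bce))  # ~0.46 with these made-up numbers
```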
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
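Complementing the inference example in the docstring above, the training path expects `target` to be a list with one dict per image, holding relative `boxes` and boolean ambiguity `flags`, as consumed by `compute_target`. A hedged sketch of a training forward pass, where shapes and box values are purely illustrative:

```python
import numpy as np
import tensorflow as tf
from doctr.models import db_resnet50

model = db_resnet50(pretrained=False)
images = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)

# One dict per image: relative (xmin, ymin, xmax, ymax) boxes + ambiguity flags
target = [{
    "boxes": np.array([[0.1, 0.1, 0.4, 0.2], [0.5, 0.6, 0.9, 0.7]], dtype=np.float32),
    "flags": np.array([False, False]),
}]

out = model(images, target=target, training=True)
print(out["loss"])  # scalar loss combining the BCE, dice and L1 terms
```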
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.8.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index d8f6168c33..bbc634a899 100644 --- a/v0.8.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -735,7 +735,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo

- + diff --git a/v0.8.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.8.0/_modules/doctr/models/detection/fast/tensorflow.html index 5b84d2dea1..65e1a77af8 100644 --- a/v0.8.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -769,7 +769,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/detection/linknet.html b/v0.8.0/_modules/doctr/models/detection/linknet.html deleted file mode 100644 index 129cfdce8b..0000000000 --- a/v0.8.0/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@ - - - - - - - - - - - - doctr.models.detection.linknet - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: probability map output by the LinkNet model
-            bitmap: bitmap computed from pred (binarized)
-
-        Returns:
-            np tensor of boxes for the bitmap, where each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
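The post-processing above relies on OpenCV connected components followed by bounding-rectangle extraction. A minimal sketch of that flow on a toy binary map (illustrative only, with an explicit int32 cast for `cv2.boundingRect`):

```python
import cv2
import numpy as np

bitmap = np.zeros((8, 16), dtype=np.uint8)
bitmap[2:5, 3:9] = 1  # a single blob of "text" pixels
height, width = bitmap.shape

n_labels, label_img = cv2.connectedComponents(bitmap, connectivity=4)
for label in range(1, n_labels):  # label 0 is the background
    # (x, y) coordinates of the pixels belonging to this component
    points = np.array(np.where(label_img == label)[::-1]).T.astype(np.int32)
    x, y, w, h = cv2.boundingRect(points)
    # relative coordinates, as returned by bitmap_to_boxes
    print(x / width, y / height, (x + w) / width, (y + h) / height)
```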
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=bool)
-        seg_mask = np.ones(output_shape, dtype=bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
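The loss above only scores pixels that are not masked out: the boolean `seg_mask` selects the valid positions before the BCE reduction. An equivalent minimal sketch using `tf.boolean_mask` on small illustrative tensors:

```python
import tensorflow as tf

logits = tf.constant([[2.0, -1.0, 0.5, -3.0]])        # raw model outputs
seg_target = tf.constant([[1.0, 0.0, 1.0, 0.0]])
seg_mask = tf.constant([[True, True, False, True]])    # 3rd pixel is ignored

loss = tf.reduce_mean(
    tf.keras.losses.binary_crossentropy(
        tf.boolean_mask(seg_target, seg_mask),
        tf.boolean_mask(logits, seg_mask),
        from_logits=True,
    )
)
print(float(loss))
```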
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.8.0/_modules/doctr/models/detection/linknet/tensorflow.html index c5fd053513..d374bb6d1e 100644 --- a/v0.8.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -698,7 +698,7 @@

Source code for doctr.models.detection.linknet.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/detection/zoo.html b/v0.8.0/_modules/doctr/models/detection/zoo.html index 00783fcab6..dd9da6dc8e 100644 --- a/v0.8.0/_modules/doctr/models/detection/zoo.html +++ b/v0.8.0/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -412,7 +412,7 @@

Source code for doctr.models.detection.zoo

     
   
- + diff --git a/v0.8.0/_modules/doctr/models/export.html b/v0.8.0/_modules/doctr/models/export.html deleted file mode 100644 index f25a81aa21..0000000000 --- a/v0.8.0/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@ - - - - - - - - - - - - doctr.models.export - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- -
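The three helpers above all return the serialized flatbuffer as `bytes`. A hedged sketch of how such a blob can be saved and executed with the TFLite interpreter; the int8 input follows from `quantize_model` above, while the file name and shapes are illustrative:

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from doctr.models import conv_sequence, quantize_model

model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
serialized = quantize_model(model, (224, 224, 3))

# Persist the flatbuffer, then run it with the TFLite interpreter
with open("model_int8.tflite", "wb") as f:
    f.write(serialized)

interpreter = tf.lite.Interpreter(model_content=serialized)
interpreter.allocate_tensors()
inp = interpreter.get_input_details()[0]
out = interpreter.get_output_details()[0]

dummy = np.random.randint(-128, 127, size=(1, 224, 224, 3), dtype=np.int8)
interpreter.set_tensor(inp["index"], dummy)
interpreter.invoke()
print(interpreter.get_tensor(out["index"]).shape)
```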
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/models/factory/hub.html b/v0.8.0/_modules/doctr/models/factory/hub.html index a49f4ebde7..93aa0aa8f3 100644 --- a/v0.8.0/_modules/doctr/models/factory/hub.html +++ b/v0.8.0/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -572,7 +572,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.8.0/_modules/doctr/models/recognition/crnn.html b/v0.8.0/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.8.0/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.crnn - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs decoding of raw output with CTC and decoding of CTC predictions
-        with the label_to_idx mapping dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
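After beam-search decoding, indices are mapped back to characters through an embedding table in which the index `len(vocab)` (the CTC blank) maps to an `<eos>` marker, and everything after the first `<eos>` is discarded. A small illustrative sketch of that mapping step; the vocab and index sequences are made up, and `_embedding` itself is built in the post-processor base class, which is not shown here:

```python
import tensorflow as tf

vocab = "abc"
# indices 0..2 -> characters, index 3 (the blank) -> "<eos>"
embedding = tf.constant(list(vocab) + ["<eos>"])

out_idxs = tf.constant([[0, 1, 3, 2], [2, 2, 0, 3]])  # decoded index sequences

joined = tf.strings.reduce_join(tf.nn.embedding_lookup(embedding, out_idxs), axis=-1)
words = tf.strings.split(joined, "<eos>")
words = tf.sparse.to_dense(words.to_sparse(), default_value="")[:, 0]
print([w.decode() for w in words.numpy().tolist()])   # ['ab', 'cca']
```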
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            model_output: predicted logits of the model
-            target: list of target strings, encoded internally into gt labels and sequence lengths
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process predictions
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.8.0/_modules/doctr/models/recognition/crnn/tensorflow.html index a00647e1b2..a8a19605ba 100644 --- a/v0.8.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -650,7 +650,7 @@

Source code for doctr.models.recognition.crnn.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.8.0/_modules/doctr/models/recognition/master/tensorflow.html index 446786da5f..fa02c4de73 100644 --- a/v0.8.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -644,7 +644,7 @@

Source code for doctr.models.recognition.master.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.8.0/_modules/doctr/models/recognition/parseq/tensorflow.html index bd56053be1..d06bbd51e6 100644 --- a/v0.8.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -837,7 +837,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/recognition/sar.html b/v0.8.0/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.8.0/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.sar - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H, W, C) -> (N, C)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embedded_symbol: shape (N, embedding_units)
-            embedded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embedded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, feature_channels)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + feature_channels) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read:A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
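The masking above zeroes the cross-entropy terms that come after the `<eos>` position of each word; `tf.sequence_mask` builds the corresponding boolean matrix. A tiny illustrative example with made-up lengths:

```python
import tensorflow as tf

# Two words of (post-<eos>) lengths 2 and 4, over 5 decoding timesteps
seq_len = tf.constant([2, 4])
mask_2d = tf.sequence_mask(seq_len, 5)
print(mask_2d.numpy())
# [[ True  True False False False]
#  [ True  True  True  True False]]

cce = tf.ones((2, 5))  # pretend per-timestep losses
masked = tf.where(mask_2d, cce, tf.zeros_like(cce))
loss_per_word = tf.reduce_sum(masked, axis=1) / tf.cast(seq_len, tf.float32)
print(loss_per_word.numpy())  # [1. 1.]
```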
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process predictions
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- -
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.8.0/_modules/doctr/models/recognition/sar/tensorflow.html index 6a44c6d2f4..9bbcdfbf81 100644 --- a/v0.8.0/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -753,7 +753,7 @@

Source code for doctr.models.recognition.sar.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.8.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 1a97114efa..7131ac4a5b 100644 --- a/v0.8.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.8.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -610,7 +610,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/models/recognition/zoo.html b/v0.8.0/_modules/doctr/models/recognition/zoo.html index 4c61c0e058..b6896dd45c 100644 --- a/v0.8.0/_modules/doctr/models/recognition/zoo.html +++ b/v0.8.0/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -401,7 +401,7 @@

Source code for doctr.models.recognition.zoo

   
- + diff --git a/v0.8.0/_modules/doctr/models/zoo.html b/v0.8.0/_modules/doctr/models/zoo.html index 7d5510b773..a964ff6aff 100644 --- a/v0.8.0/_modules/doctr/models/zoo.html +++ b/v0.8.0/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -570,7 +570,7 @@

Source code for doctr.models.zoo

     
   
- + diff --git a/v0.8.0/_modules/doctr/transforms/modules.html b/v0.8.0/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.8.0/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - - - - - - - - - - - - doctr.transforms.modules - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/transforms/modules/base.html b/v0.8.0/_modules/doctr/transforms/modules/base.html index 42e8b8d2b1..087636ae0d 100644 --- a/v0.8.0/_modules/doctr/transforms/modules/base.html +++ b/v0.8.0/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -615,7 +615,7 @@

Source code for doctr.transforms.modules.base

- + diff --git a/v0.8.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.8.0/_modules/doctr/transforms/modules/tensorflow.html index 5e85447d5c..9ef65dafc0 100644 --- a/v0.8.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.8.0/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -888,7 +888,7 @@

Source code for doctr.transforms.modules.tensorflow

- + diff --git a/v0.8.0/_modules/doctr/utils/metrics.html b/v0.8.0/_modules/doctr/utils/metrics.html index 5190fb3dd2..bec0aee3f4 100644 --- a/v0.8.0/_modules/doctr/utils/metrics.html +++ b/v0.8.0/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -1071,7 +1071,7 @@

Source code for doctr.utils.metrics

     
   
- + diff --git a/v0.8.0/_modules/doctr/utils/visualization.html b/v0.8.0/_modules/doctr/utils/visualization.html index 9094dda132..d7c33dc75a 100644 --- a/v0.8.0/_modules/doctr/utils/visualization.html +++ b/v0.8.0/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -830,7 +830,7 @@

Source code for doctr.utils.visualization

     
   
- + diff --git a/v0.8.0/_modules/index.html b/v0.8.0/_modules/index.html index 4f958d8657..ce2cfeee25 100644 --- a/v0.8.0/_modules/index.html +++ b/v0.8.0/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -371,7 +371,7 @@

All modules for which code is available

- + diff --git a/v0.8.0/_sources/datasets.rst.txt b/v0.8.0/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.8.0/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.8.0/_sources/documents.rst.txt b/v0.8.0/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.8.0/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. 
automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.8.0/_sources/installing.rst.txt b/v0.8.0/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.8.0/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.8.0/_sources/models.rst.txt b/v0.8.0/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.8.0/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. 
autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.8.0/_sources/transforms.rst.txt b/v0.8.0/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.8.0/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.8.0/_sources/utils.rst.txt b/v0.8.0/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.8.0/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. 
autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.8.0/_static/basic.css b/v0.8.0/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.8.0/_static/basic.css +++ b/v0.8.0/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.8.0/_static/doctools.js b/v0.8.0/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.8.0/_static/doctools.js +++ b/v0.8.0/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.8.0/_static/language_data.js b/v0.8.0/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.8.0/_static/language_data.js +++ b/v0.8.0/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.8.0/_static/searchtools.js b/v0.8.0/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.8.0/_static/searchtools.js +++ b/v0.8.0/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. 
+class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.8.0/changelog.html b/v0.8.0/changelog.html index 8285a5e380..9c80410651 100644 --- a/v0.8.0/changelog.html +++ b/v0.8.0/changelog.html @@ -14,7 +14,7 @@ - + Changelog - docTR documentation @@ -420,7 +420,7 @@

v0.1.0 (2021-03-05) - + diff --git a/v0.8.0/community/resources.html b/v0.8.0/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.8.0/community/resources.html +++ b/v0.8.0/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.8.0/contributing/code_of_conduct.html b/v0.8.0/contributing/code_of_conduct.html index b27650ed66..0c4d9db2bb 100644 --- a/v0.8.0/contributing/code_of_conduct.html +++ b/v0.8.0/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -498,7 +498,7 @@

Attribution - + diff --git a/v0.8.0/contributing/contributing.html b/v0.8.0/contributing/contributing.html index a878d4a467..12c9ec5fd1 100644 --- a/v0.8.0/contributing/contributing.html +++ b/v0.8.0/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -475,7 +475,7 @@

Let’s connect - + diff --git a/v0.8.0/datasets.html b/v0.8.0/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.8.0/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework -can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = CORD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before passing it to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets -of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name

size

characters

digits

10

0123456789

ascii_letters

52

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

punctuation

32

!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~

currency

5

£€¥¢฿

latin

96

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

french

154

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as a mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
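As an illustration, here is a minimal sketch of how this function can be called (toy sequences and an illustrative lowercase vocab, not one of the predefined vocabs above):
>>> from doctr.datasets import encode_sequences
>>> # each sequence becomes a row of character indices; remaining positions are filled with the eos value
>>> encoded = encode_sequences(sequences=["hello", "world"], vocab="abcdefghijklmnopqrstuvwxyz", target_size=8)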
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/documents.html b/v0.8.0/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.8.0/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

  • -
  • size (the page's)

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
-

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF file as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/genindex.html b/v0.8.0/genindex.html index 60b8645d9f..4e4a97a0d4 100644 --- a/v0.8.0/genindex.html +++ b/v0.8.0/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -740,7 +740,7 @@

W

- + diff --git a/v0.8.0/getting_started/installing.html b/v0.8.0/getting_started/installing.html index 3d28b70e68..ee89a6da08 100644 --- a/v0.8.0/getting_started/installing.html +++ b/v0.8.0/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -422,7 +422,7 @@

Via Git - + diff --git a/v0.8.0/index.html b/v0.8.0/index.html index 0c21ab0888..b2b248cd9c 100644 --- a/v0.8.0/index.html +++ b/v0.8.0/index.html @@ -14,7 +14,7 @@ - + docTR documentation @@ -436,7 +436,7 @@

Supported datasets - + diff --git a/v0.8.0/installing.html b/v0.8.0/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.8.0/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running an OS other than Linux, you will need a few extra dependencies.

-

For MacOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the latest stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
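Whichever installation method you used, you can quickly check that the package is importable (this sketch assumes doctr exposes the usual __version__ attribute):
python -c "import doctr; print(doctr.__version__)"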
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/models.html b/v0.8.0/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.8.0/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Input shape

# params

Recall

Precision

Recall

Precision

FPS

db_resnet50

(1024, 1024, 3)

25.2 M

82.14

87.64

92.49

89.66

2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

-

FPS (Frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up, then we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.12xlarge AWS instance (Xeon Platinum 8275L CPU) to perform experiments.

-
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following (a short sketch is given after the list):

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
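To make these steps concrete, here is a minimal sketch of such a pipeline built from the transforms documented in this release (the target size and normalization statistics below are illustrative values, not necessarily the ones used by the pretrained detection models):
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, Normalize
>>> transfos = Compose([Resize((1024, 1024)), Normalize(mean=(0.5, 0.5, 0.5), std=(1., 1., 1.))])
>>> images = [tf.random.uniform(shape=[900, 600, 3], minval=0, maxval=1) for _ in range(2)]
>>> batch = tf.stack([transfos(img) for img in images], axis=0)  # resize (with potential deformation) + normalize each image, then batch them together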
-

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) → DBNet

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

Example:
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

Returns:
  text detection architecture
doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) → LinkNet

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

Example:
>>> import tensorflow as tf
>>> from doctr.models import linknet16
>>> model = linknet16(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

Returns:
  text detection architecture

Detection predictors

By combining the right components around a given architecture, predictors let you pass numpy images as inputs and return structured information.

doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) → DetectionPredictor

Text detection architecture.

Example:
>>> import numpy as np
>>> from doctr.models import detection_predictor
>>> model = detection_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> out = model([input_page])

Parameters:
  • arch – name of the architecture to use (‘db_resnet50’)
  • pretrained – If True, returns a model pre-trained on our text detection dataset

Returns:
  Detection predictor

Text Recognition


Identifying strings in images

Text recognition model zoo

| Architecture  | Input shape  | # params | FUNSD | CORD | FPS  |
|---------------|--------------|----------|-------|------|------|
| crnn_vgg16_bn | (32, 128, 3) | 15.8M    | 86.02 | 91.3 | 12.8 |
| sar_vgg16_bn  | (32, 128, 3) | 21.5M    | 86.2  | 91.7 | 3.3  |
| sar_resnet31  | (32, 128, 3) | 53.1M    | 86.3  | 92.1 | 2.7  |

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

All these recognition models are trained with our French vocab (cf. Supported Vocabs).

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model capabilities.

FPS (frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up, then we measure its average speed on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). Experiments were run on an AWS c5.12xlarge instance (CPU: Xeon Platinum 8275L).


Pre-processing for recognition


In DocTR, the pre-processing scheme for recognition is the following:

  1. resize each input image to the target size (bilinear interpolation by default) without deformation
  2. pad the image to the target size (with zeros by default)
  3. batch images together
  4. normalize the batch using the training data statistics
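
A rough TensorFlow sketch of this scheme is given below; as with the detection sketch above, the target shape and the normalization statistics are illustrative placeholders rather than the exact values bundled with the pretrained recognition models:

import tensorflow as tf

def preprocess_for_recognition(crops, target_size=(32, 128),
                               mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)):
    processed = []
    for crop in crops:
        # 1. resize without deformation: scale the crop so it fits inside the target size
        h, w = crop.shape[0], crop.shape[1]
        scale = min(target_size[0] / h, target_size[1] / w)
        resized = tf.image.resize(crop, (max(1, int(h * scale)), max(1, int(w * scale))), method="bilinear")
        # 2. pad to the target size (with zeros)
        processed.append(tf.image.pad_to_bounding_box(resized, 0, 0, target_size[0], target_size[1]))
    # 3. batch crops together
    batch = tf.stack(processed, axis=0)
    # 4. normalize the batch with the training data statistics
    return (batch / 255. - tf.constant(mean)) / tf.constant(std)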

Recognition models

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) → CRNN

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition”.

Example:
>>> import tensorflow as tf
>>> from doctr.models import crnn_vgg16_bn
>>> model = crnn_vgg16_bn(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:
  text recognition architecture
doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) → SAR

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example:
>>> import tensorflow as tf
>>> from doctr.models import sar_vgg16_bn
>>> model = sar_vgg16_bn(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:
  text recognition architecture
doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) → SAR

SAR with a ResNet-31 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

Example:
>>> import tensorflow as tf
>>> from doctr.models import sar_resnet31
>>> model = sar_resnet31(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:
  text recognition architecture
doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) → MASTER

MASTER as described in the paper: https://arxiv.org/pdf/1910.02562.pdf.

Example:
>>> import tensorflow as tf
>>> from doctr.models import master
>>> model = master(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)

Parameters:
  pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

Returns:
  text recognition architecture

Recognition predictors


Combining the right components around a given architecture for easier usage.

doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → RecognitionPredictor

Text recognition architecture.

Example:
>>> import numpy as np
>>> from doctr.models import recognition_predictor
>>> model = recognition_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
>>> out = model([input_page])

Parameters:
  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)
  • pretrained – If True, returns a model pre-trained on our text recognition dataset

Returns:
  Recognition predictor

End-to-End OCR


Predictors that localize and identify text elements in images

| Architecture                | FUNSD Recall | FUNSD Precision | FUNSD FPS | CORD Recall | CORD Precision | CORD FPS |
|-----------------------------|--------------|-----------------|-----------|-------------|----------------|----------|
| db_resnet50 + crnn_vgg16_bn | 70.08        | 74.77           | 0.85      | 82.19       | 79.67          | 1.6      |
| db_resnet50 + sar_vgg16_bn  | N/A          | N/A             | 0.49      | N/A         | N/A            | 1.0      |
| db_resnet50 + sar_resnet31  | N/A          | N/A             | 0.27      | N/A         | N/A            | 0.83     |
| Gvision text detection      | 59.50        | 62.50           |           | 75.30       | 70.00          |          |
| Gvision doc. text detection | 64.00        | 53.30           |           | 68.90       | 61.10          |          |
| AWS textract                | 78.10        | 83.00           |           | 87.50       | 66.00          |          |

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

All recognition models used in these predictors are trained with our French vocab (cf. Supported Vocabs).

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities.

FPS (frames per second) is computed as follows: we instantiate the predictor, warm up the model, and then measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. Experiments were run on an AWS c5.12xlarge instance (CPU: Xeon Platinum 8275L).

Results on private OCR datasets

| Architecture                       | Receipts Recall | Receipts Precision | Invoices Recall | Invoices Precision | IDs Recall | IDs Precision |
|------------------------------------|-----------------|--------------------|-----------------|--------------------|------------|---------------|
| db_resnet50 + crnn_vgg16_bn (ours) | 78.90           | 81.01              | 65.68           | 69.86              | 49.48      | 50.46         |
| Gvision doc. text detection        | 68.91           | 59.89              | 63.20           | 52.85              | 43.70      | 29.21         |
| AWS textract                       | 75.77           | 77.70              | 70.47           | 69.13              | 46.39      | 43.32         |

Two-stage approaches

These architectures involve one stage of text detection and one stage of text recognition: the detection output is used to produce cropped images that are then passed to the text recognition block. A minimal sketch of this flow is shown below.
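
To make the data flow concrete, here is a hedged sketch of what a two-stage pipeline does by hand: run the detection predictor, crop every detected box out of the page, and feed the crops to the recognition predictor. The output format assumed for the detection predictor (one array of relative box coordinates per page) is an assumption of this sketch; the ocr_predictor documented below wires these stages together for you:

import numpy as np
from doctr.models import detection_predictor, recognition_predictor

det_model = detection_predictor(arch='db_resnet50', pretrained=True)
reco_model = recognition_predictor(arch='crnn_vgg16_bn', pretrained=True)

page = (255 * np.random.rand(1024, 1024, 3)).astype(np.uint8)

# stage 1: localize text (assumed output: one array of relative boxes per input page)
boxes = det_model([page])[0]

# stage 2: crop each detected box and recognize its content
h, w = page.shape[:2]
crops = [
    page[int(ymin * h):int(ymax * h), int(xmin * w):int(xmax * w)]
    for xmin, ymin, xmax, ymax in boxes[:, :4]
]
words = reco_model(crops) if crops else []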

doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → OCRPredictor

End-to-end OCR architecture using one model for localization, and another for text recognition.

Example:
>>> import numpy as np
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> out = model([input_page])

Parameters:
  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)
  • pretrained – If True, returns a model pre-trained on our OCR dataset

Returns:
  OCR predictor
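
The object returned by the predictor is a structured Document (pages, blocks, lines, words, as described in doctr.io). A common follow-up, sketched below under that assumption, is to walk the nested structure to retrieve the recognized text:

# walk the nested structure: Document -> Page -> Block -> Line -> Word
for page in out.pages:
    for block in page.blocks:
        for line in block.lines:
            print(" ".join(word.value for word in line.words))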

Model export


Utility functions to make the most of document analysis models.


Model compression

doctr.models.export.convert_to_tflite(tf_model: Model) → bytes

Converts a model to TFLite format.

Example:
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_tflite, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_tflite(model)

Parameters:
  tf_model – a Keras model

Returns:
  the serialized TFLite model

Return type:
  bytes
doctr.models.export.convert_to_fp16(tf_model: Model) → bytes

Converts a model to half precision.

Example:
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_fp16, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_fp16(model)

Parameters:
  tf_model – a Keras model

Returns:
  the serialized FP16 model

Return type:
  bytes
doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) → bytes

Quantize a TensorFlow model.

Example:
>>> from tensorflow.keras import Sequential
>>> from doctr.models import quantize_model, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = quantize_model(model, (224, 224, 3))

Parameters:
  • tf_model – a Keras model
  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

Returns:
  the serialized quantized model

Return type:
  bytes
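
All three helpers return the serialized model as a bytes object, so persisting the result is a plain file write. The snippet below is a sketch of saving a quantized model and reloading it with the TensorFlow Lite interpreter (the toy conv_sequence model mirrors the examples above):

import tensorflow as tf
from tensorflow.keras import Sequential
from doctr.models import quantize_model, conv_sequence

model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
serialized_model = quantize_model(model, (224, 224, 3))

# the bytes object can be written to disk like any other binary blob
with open('model_quantized.tflite', 'wb') as f:
    f.write(serialized_model)

# ... and reloaded with the TFLite interpreter for inference
interpreter = tf.lite.Interpreter(model_content=serialized_model)
interpreter.allocate_tensors()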

Using SavedModel

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> _ = model(input_t, training=False)
>>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')

And loaded just as easily:

>>> import tensorflow as tf
>>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
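
The reloaded object behaves like a traced TensorFlow function; under that assumption, inference is a direct call on a properly shaped tensor, mirroring the export snippet above:

>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_t, training=False)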
"resolv": 6, "resolve_block": 16, "resolve_lin": 16, "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "return": 16, "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "roboflow": [], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sanjin": [], "sar": [4, 7], "sar_resnet31": [7, 16], "sar_vgg16_bn": [], "satur": 8, "save": [7, 14], "saved_model": [], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seemlessli": [], "seen": 16, "segment": [4, 7, 16], "self": 16, "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": [8, 16], "seri": 1, "serial": [], "serialized_model": [], "seriou": 1, "set": [1, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": [1, 16], "span": 16, "spanish": 5, "spatial": [4, 5, 6, 9], "special": [], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": [], "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": 7, "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": [], "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": 5, "symbol": [], "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "task": [4, 5, 7, 13, 14, 16], "task2": 5, "team": [], "techminde": [], "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": [5, 14], "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": 16, "textmatch": 9, "textnet": 7, "textnet_bas": 7, "textnet_smal": 7, "textnet_tini": 7, "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "tf_model": [], "tflite": [], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": 
16, "through": [1, 8, 14], "tilman": 13, "time": [1, 4, 7, 9, 14], "tini": 7, "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "turn": [], "two": [6, 12], "txt": 5, "type": [6, 9, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": [14, 16], "underneath": 6, "understand": [4, 5, 16], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unit": [], "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v0": [], "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "veri": 7, "verifi": [], "verma": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn_r": 7, "via": 1, "video": [], "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warm": [], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": 6, "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14, 16], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [4, 5, 14], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x12larg": [], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xeon": [], "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "yugesh": [], "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, 
"\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 5, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": 5, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 5, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 5, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 5, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], 
"\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": [], "2024": 0, "21": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": [], "9": [], "advanc": 16, "approach": 16, "architectur": 16, "arg": [5, 6, 7, 8, 9], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "compress": [], "conda": [], "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "from": 13, "gener": [5, 14], "get": [], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "implement": [], "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "linux": [], "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "note": [], "notebook": 10, "object": 14, "ocr": 16, "onli": [], "onnx": 15, "optim": 15, "option": 16, 
"orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": [], "pre": [], "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "process": [], "push": 13, "python": 3, "qualiti": 2, "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "refer": [], "report": 2, "request": 2, "resourc": [], "respons": 1, "return": [5, 6, 7, 9], "right": 16, "savedmodel": [], "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "start": [], "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Advanced options": [[16, "advanced-options"]], "Args:": [[5, "args"], [5, "id4"], [5, "id7"], [5, "id10"], [5, "id13"], [5, "id16"], [5, "id19"], [5, "id22"], [5, "id25"], [5, "id29"], [5, "id32"], [5, "id37"], [5, "id40"], [5, "id46"], [5, "id49"], [5, "id50"], [5, "id51"], [5, "id54"], [5, "id57"], [5, "id60"], [5, "id61"], [6, "args"], [6, "id2"], [6, "id3"], [6, "id4"], [6, "id5"], [6, "id6"], [6, "id7"], [6, "id10"], [6, "id12"], [6, "id14"], [6, "id16"], [6, "id20"], [6, "id24"], [6, "id28"], [7, "args"], [7, "id3"], [7, "id8"], [7, "id13"], [7, "id17"], [7, "id21"], [7, "id26"], [7, "id31"], [7, "id36"], [7, "id41"], [7, "id45"], [7, "id49"], [7, "id54"], [7, "id58"], [7, "id63"], [7, "id68"], [7, "id72"], [7, "id76"], [7, "id81"], [7, "id86"], [7, "id90"], [7, "id95"], [7, "id99"], [7, "id103"], [7, "id108"], [7, "id113"], [7, "id118"], [7, "id122"], [7, "id126"], [7, "id131"], [7, "id135"], [7, "id139"], [7, "id143"], [7, "id145"], [7, "id147"], [7, "id149"], [8, "args"], [8, "id1"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id8"], [8, "id9"], [8, "id10"], [8, "id11"], [8, "id12"], [8, "id13"], [8, "id14"], [8, "id15"], [8, "id16"], [8, "id17"], [8, "id18"], [9, "args"], [9, "id3"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, 
"developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface Hub": [[13, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Returns:": [[5, "returns"], [6, "returns"], [6, "id11"], [6, "id13"], [6, "id15"], [6, "id19"], [6, "id23"], [6, "id27"], [6, "id31"], [7, "returns"], [7, "id6"], [7, "id11"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id29"], [7, "id34"], [7, "id39"], [7, "id44"], [7, "id48"], [7, "id52"], [7, "id57"], [7, "id61"], [7, "id66"], [7, "id71"], [7, "id75"], [7, "id79"], [7, "id84"], [7, "id89"], [7, "id93"], [7, "id98"], [7, "id102"], [7, "id106"], [7, "id111"], [7, "id116"], [7, "id121"], [7, "id125"], [7, "id129"], [7, "id134"], [7, "id138"], [7, "id142"], [7, "id144"], [7, "id146"], [7, "id148"], [9, "returns"], [9, "id4"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, "use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": 
[[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2024-09-09)": [[0, "v0-7-0-2024-09-09"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, 
"doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, 
"doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", 
false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[5, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WILDRECEIPT"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", 
"from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "textnet_base"], [7, 1, 1, "", "textnet_small"], [7, 1, 1, "", "textnet_tiny"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": [], "02562": 7, "03": 16, "035": 16, "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 9, 11, 16], "104": 5, "106": 5, "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": 
5, "115": [], "1156": 14, "116": 5, "118": 5, "11800h": 16, "11th": 16, "12": [3, 16], "120": 5, "123": 5, "126": 5, "1268": 14, "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, "1396484375": 16, "14": 16, "1420": 16, "14470v1": 5, "149": 14, "15": 16, "150": [9, 16], "1552": 16, "16": [7, 15, 16], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": 7, "185546875": 16, "1900": 16, "1910": 7, "19342": 14, "19370": 14, "195": 5, "19598": 14, "199": 16, "1999": 16, "2": [3, 4, 5, 6, 8, 16], "20": 16, "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2023": [], "207901": 14, "21": 16, "2103": 5, "2186": 14, "21888": 14, "22": 16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": 16, "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": 7, "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": [8, 16], "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": [], "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": 16, "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": 14, "48": [5, 16], "485": 8, "49": 16, "49377": 14, "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "597": 16, "5k": [4, 5], "5m": [], "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "62": 16, "626": 14, "63": 16, "64": [7, 8, 16], "641": 16, "647": 14, "65": 16, "66": 16, "67": 16, "68": 16, "69": 16, "693": 11, "694": 11, "695": 11, "6m": [], "7": 16, "70": [9, 16], "707470": 14, "71": 16, "7100000": 14, "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "74": 16, "75": [8, 16], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "785": 11, "79": 16, "793533": 14, "796": 14, "798": 11, "7m": [], "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "82": 16, "83": 16, "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "8603515625": 16, "87": 16, "8707": 14, "88": 16, "89": 16, "9": 16, "90": 16, "90k": 5, "90kdict32px": 5, "91": 16, "914085328578949": 16, "92": 16, "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": [], "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "As": 2, "Be": 16, "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 16], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": 7, "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": 16, "_build": 2, "_i": 9, "ab": 5, "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, "abstract": [], "abstractdataset": 5, "abus": 1, "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": 16, "accuraci": 9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13, 16], 
"add_hook": 16, "add_label": 9, "addit": [2, 3, 6], "addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": 7, "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, "along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "andrej": [], "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": 7, "arch": [7, 13], "architectur": [4, 7, 13], "area": 16, "arg": [], "argument": [5, 6, 7, 9, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "articl": [], "artifici": [4, 5], "arxiv": [5, 7], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autom": 4, "automat": 16, "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": 16, "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "baranovskij": [], "base": [4, 7], "baselin": [4, 7, 16], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": [], "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": [1, 16], "being": [9, 16], "belong": 16, "benchmark": 16, "best": 1, "better": [10, 16], "between": [8, 9, 16], "bgr": 6, "bilinear": 8, "bin_thresh": 16, "binar": [4, 7, 16], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": 16, "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [6, 9], "c_j": 9, "cach": [2, 5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], "capabl": [2, 10, 16], "case": [5, 9], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "channel": [1, 2, 6, 8], "channel_prior": [], "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": [5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, "check": [2, 13, 16], "checkpoint": 7, "chip": 3, "christian": [], "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 7, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 
5, "compil": [10, 16], "complaint": 1, "complementari": 9, "complet": 2, "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], "config": 7, "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "convers": 6, "convert": [6, 8], "convolut": 7, "cool": [], "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": [13, 16], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": 16, "cvit": 4, "czczup": 7, "czech": 5, "d": [5, 14], "danish": 5, "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": 16, "default": [6, 9, 11, 12, 16], "defer": 14, "defin": [9, 15], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": [11, 16], "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, "detectionpredictor": [7, 11], "detector": 7, "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12, 16], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, "discuss": 2, "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distribut": 8, "div": 16, "divers": 1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": 16, "documentfil": [6, 13], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [4, 5], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": 5, "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easili": [6, 9, 11, 13, 14, 16], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 
6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enough": [2, 16], "ensur": 2, "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exampl": [1, 2, 4, 5, 7, 13, 16], "exchang": 15, "execut": 16, "exist": 13, "expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": 13, "fast": [4, 5, 7], "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [7, 15], "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15, 16], "figsiz": 9, "figur": 9, "file": [2, 5], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": 5, "first": [2, 5], "firsthand": 5, "fit": [7, 16], "flag": 16, "flip": 8, "float": [6, 8, 9, 15], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": [], "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "found": [], "fp16": 15, "frac": 9, "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gallagh": [], "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11, 13], "get": 16, "git": 13, "github": [2, 3, 7, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, "googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": [4, 5, 6], "grayscal": 8, "ground": 9, "groung": 9, "group": [4, 16], "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, "gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "handl": [14, 16], "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": 5, "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5, 16], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": 16, "horizont": [6, 8], "hous": 5, "how": [2, 11, 13, 14], "howev": 14, "hsv": 8, "html": [1, 2, 6, 16], "http": [1, 3, 5, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ibrahimov": [], "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 14], 
"img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 7, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": 7, "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inform": [1, 2, 4, 5, 14], "input": [2, 6, 7, 8, 15, 16], "input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "issu": [1, 2, 13], "italian": 5, "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "jame": [], "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "kei": [4, 5], "kera": [7, 15], "kernel": [7, 8], "kernel_shap": 8, "keywoard": 7, "keyword": [5, 6, 7, 9], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": 1, "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], "legacy_french": 5, "length": [5, 16], "less": [15, 16], "level": [1, 5, 9, 16], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 7, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet_resnet18": [7, 11, 16], "linknet_resnet18_rot": [], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": 3, "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loc_pr": 16, "local": [2, 4, 5, 7, 9, 14, 16], "localis": 5, "localizationconfus": 9, "locat": [2, 6, 16], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "love": 13, "lower": [8, 9, 16], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": [14, 16], "manipul": 16, "map": [5, 7], "map_loc": 11, "mask_shap": 9, "master": [4, 7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": [6, 9], "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": 7, "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [6, 8, 16], "metric": [9, 16], "middl": 16, "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, 
"minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": 7, "minimum": [3, 5, 8, 9, 16], "minval": 8, "miss": 3, "mistak": 1, "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "modal": [4, 5], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12, 16], "modul": [6, 7, 8, 9, 16], "moment": [], "more": [2, 9, 14, 16], "moscardi": [], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [5, 13], "multipl": [5, 6, 8, 16], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": 16, "n": [5, 9], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": [], "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13, 16], "neg": 8, "nest": 16, "netraj": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": 5, "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": [1, 7], "offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "opinion": 1, "optic": [4, 16], "optim": 4, "option": [5, 7, 11], "order": [2, 5, 6, 8], "org": [1, 5, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": 16, "paragraph_break": 16, "parallel": [], "param": [8, 16], "paramet": [4, 6, 7, 15], "pars": [4, 5], "parseq": [4, 7, 13, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": [7, 9], "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "patil": [], "pattern": 1, "pdf": [6, 7, 10], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 7, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": 16, "pixbuf": 3, "pixel": [6, 8, 16], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, 
"polish": 5, "polit": 1, "polygon": [5, 9, 16], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 13, 16], "post": [1, 16], "postprocessor": 16, "potenti": 7, "power": 4, "ppageno": 16, "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9, 16], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, "preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pypdfium2": 6, "pyplot": [6, 9], "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": 13, "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": [], "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "realli": [], "reason": [1, 4, 5], "rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, "recogn": 16, "recognit": [5, 9, 11], "recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": 16, "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9, 16], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "remov": 1, "render": [6, 16], "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], "repres": [1, 9, 15, 16], "represent": [4, 7], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": 16, "resolve_lin": 16, "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "return": 16, "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "roboflow": [], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sanjin": [], "sar": [4, 7], "sar_resnet31": [7, 16], "satur": 8, "save": [7, 14], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "score": 9, "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seen": 16, "segment": [4, 7, 16], "self": 16, "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, 
"separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": [8, 16], "seri": 1, "seriou": 1, "set": [1, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "shade": 8, "shape": [6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": [1, 16], "span": 16, "spanish": 5, "spatial": [4, 5, 6, 9], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": [], "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": 7, "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": [], "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": 5, "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "task": [4, 5, 7, 13, 14, 16], "task2": 5, "team": [], "techminde": [], "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": [5, 14], "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": 16, "textmatch": 9, "textnet": 7, "textnet_bas": 7, "textnet_smal": 7, "textnet_tini": 7, "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": 16, "through": [1, 8, 14], "tilman": 13, "time": [1, 4, 7, 9, 14], "tini": 7, "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "two": [6, 12], "txt": 5, "type": [6, 9, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": [14, 16], "underneath": 6, "understand": [4, 5, 16], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unit": [], "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, 
"upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "veri": 7, "verma": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn_r": 7, "via": 1, "video": [], "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": 6, "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14, 16], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [4, 5, 14], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "yugesh": [], "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 5, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, 
"\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": 5, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 5, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 5, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 5, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], 
"\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": [], "2024": 0, "21": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": [], "9": [], "advanc": 16, "approach": 16, "architectur": 16, "arg": [5, 6, 7, 8, 9], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "ban": 1, "block": 6, "bug": 2, "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "conda": [], "conduct": 1, "connect": 2, "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "from": 13, "gener": [5, 14], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "linux": [], "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "notebook": 10, "object": 14, "ocr": 16, "onli": [], "onnx": 15, "optim": 15, "option": 16, "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "push": 13, "python": 3, "qualiti": 2, "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "report": 2, "request": 2, "resourc": [], "respons": 1, "return": [5, 6, 7, 9], "right": 16, "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file diff --git a/v0.8.0/transforms.html b/v0.8.0/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.8.0/transforms.html +++ 
/dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.

-
-

Supported transformations

-

Here are all the transformations available through docTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
- -
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalizes each channel of a tensor using the given mean and standard deviation

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Applies a user-defined function to a tensor (image or batch of images)

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Converts an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): -convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta -to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting -each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and -increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjusts the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.
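For instance, a resize step can be chained with channel normalization into a single callable (a minimal sketch using Compose, documented below; the target size and normalization statistics are illustrative):
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, Normalize
>>> transfo = Compose([Resize((32, 32)), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))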

-
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomGamma, RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations, one only will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Applies the input transformation with probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.0/using_doctr/custom_models_training.html b/v0.8.0/using_doctr/custom_models_training.html index 9d5e2b2767..cae5a30ecd 100644 --- a/v0.8.0/using_doctr/custom_models_training.html +++ b/v0.8.0/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -545,7 +545,7 @@

Loading your custom trained model - + diff --git a/v0.8.0/using_doctr/running_on_aws.html b/v0.8.0/using_doctr/running_on_aws.html index 87855efb54..86eb396241 100644 --- a/v0.8.0/using_doctr/running_on_aws.html +++ b/v0.8.0/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -356,7 +356,7 @@

AWS Lambda - + diff --git a/v0.8.0/using_doctr/sharing_models.html b/v0.8.0/using_doctr/sharing_models.html index 27592853b0..94bfbf9a72 100644 --- a/v0.8.0/using_doctr/sharing_models.html +++ b/v0.8.0/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -538,7 +538,7 @@

Recognition - + diff --git a/v0.8.0/using_doctr/using_contrib_modules.html b/v0.8.0/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.8.0/using_doctr/using_contrib_modules.html +++ b/v0.8.0/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.8.0/using_doctr/using_datasets.html b/v0.8.0/using_doctr/using_datasets.html index 99aca264ac..51ce87dd4d 100644 --- a/v0.8.0/using_doctr/using_datasets.html +++ b/v0.8.0/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -623,7 +623,7 @@

Data Loading - + diff --git a/v0.8.0/using_doctr/using_model_export.html b/v0.8.0/using_doctr/using_model_export.html index e0554e6718..0983118f82 100644 --- a/v0.8.0/using_doctr/using_model_export.html +++ b/v0.8.0/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -436,7 +436,7 @@

Using your ONNX exported model in docTR - + diff --git a/v0.8.0/using_doctr/using_models.html b/v0.8.0/using_doctr/using_models.html index e7751545e8..7f81e649f0 100644 --- a/v0.8.0/using_doctr/using_models.html +++ b/v0.8.0/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1135,7 +1135,7 @@

Advanced options - + diff --git a/v0.8.0/utils.html b/v0.8.0/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.8.0/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.utils

-

This module regroups non-core features that are complementary to the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page, needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest windows side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model's performance.

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements a text match metric (word-level accuracy) for the recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, -TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, -f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, -\(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode -counterpart and its lower-case unidecode counterpart

-
-
-
- -
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ -Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, -g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float | None, float | None, float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores
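Concretely, the three scores can be unpacked directly (a small sketch mirroring the example above; the boxes are illustrative):
>>> import numpy as np
>>> from doctr.utils import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
>>> recall, precision, mean_iou = metric.summary()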

-
-
-
- -
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements an end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, -\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ -Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, -h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{L}\) is the set of possible character sequences, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall & precision for each string comparison flexibility and the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_modules/doctr/datasets/cord.html b/v0.8.1/_modules/doctr/datasets/cord.html index 354f0062c2..85f1a47a08 100644 --- a/v0.8.1/_modules/doctr/datasets/cord.html +++ b/v0.8.1/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.datasets.cord

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/core.html b/v0.8.1/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.8.1/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs]
-class VisionDataset(AbstractDataset):
-    """Implements an abstract dataset
-
-    Args:
-        url: URL of the dataset
-        file_name: name of the file once downloaded
-        file_hash: expected SHA256 of the file
-        extract_archive: whether the downloaded file is an archive to be extracted
-        download: whether the dataset should be downloaded if not present on disk
-        overwrite: whether the archive should be re-extracted
-    """
-
-    def __init__(
-        self,
-        url: str,
-        file_name: Optional[str] = None,
-        file_hash: Optional[str] = None,
-        extract_archive: bool = False,
-        download: bool = False,
-        overwrite: bool = False,
-    ) -> None:
-
-        dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets')
-
-        file_name = file_name if isinstance(file_name, str) else os.path.basename(url)
-        # Download the file if not present
-        archive_path = os.path.join(dataset_cache, file_name)
-
-        if not os.path.exists(archive_path) and not download:
-            raise ValueError("the dataset needs to be downloaded first with download=True")
-
-        archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets')
-
-        # Extract the archive
-        if extract_archive:
-            archive_path = Path(archive_path)
-            dataset_path = archive_path.parent.joinpath(archive_path.stem)
-            if not dataset_path.is_dir() or overwrite:
-                with ZipFile(archive_path, 'r') as f:
-                    f.extractall(path=dataset_path)
-
-        # List images
-        self._root = dataset_path if extract_archive else archive_path
-        self.data: List[Any] = []
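For illustration, a concrete dataset would typically subclass VisionDataset and forward its own download location to the constructor above. This is a hypothetical sketch, not an actual docTR dataset: the URL, file name and hash handling below are placeholders.
class MyDataset(VisionDataset):
    """Hypothetical dataset built on top of VisionDataset (illustrative only)."""

    def __init__(self, download: bool = False) -> None:
        super().__init__(
            url="https://example.com/my_dataset.zip",  # placeholder URL
            file_name="my_dataset.zip",  # placeholder file name
            file_hash=None,  # SHA256 verification skipped in this sketch
            extract_archive=True,
            download=download,
        )
        # A real subclass would now walk self._root and fill self.data
        # with (img_name, target) pairs, as the concrete docTR datasets do.
        self.data = []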
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_modules/doctr/datasets/datasets/tensorflow.html b/v0.8.1/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.8.1/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs]
-class VisionDataset(AbstractDataset, _VisionDataset):
-    pass
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_modules/doctr/datasets/detection.html b/v0.8.1/_modules/doctr/datasets/detection.html index faf9256c89..706b89a562 100644 --- a/v0.8.1/_modules/doctr/datasets/detection.html +++ b/v0.8.1/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -424,7 +424,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/doc_artefacts.html b/v0.8.1/_modules/doctr/datasets/doc_artefacts.html index 886999868b..dc8e8f9c29 100644 --- a/v0.8.1/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.8.1/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -408,7 +408,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.8.1/_modules/doctr/datasets/funsd.html b/v0.8.1/_modules/doctr/datasets/funsd.html index 60f7e51592..6f7ab121f0 100644 --- a/v0.8.1/_modules/doctr/datasets/funsd.html +++ b/v0.8.1/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.funsd

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/generator/tensorflow.html b/v0.8.1/_modules/doctr/datasets/generator/tensorflow.html index fecf8b2d82..814dc0822d 100644 --- a/v0.8.1/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.8.1/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -389,7 +389,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/datasets/ic03.html b/v0.8.1/_modules/doctr/datasets/ic03.html index 83f7bcddf0..cf8999d751 100644 --- a/v0.8.1/_modules/doctr/datasets/ic03.html +++ b/v0.8.1/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -452,7 +452,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/ic13.html b/v0.8.1/_modules/doctr/datasets/ic13.html index 1d92d10349..7650af381c 100644 --- a/v0.8.1/_modules/doctr/datasets/ic13.html +++ b/v0.8.1/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -425,7 +425,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/iiit5k.html b/v0.8.1/_modules/doctr/datasets/iiit5k.html index 14ab1db716..b4a54e7e22 100644 --- a/v0.8.1/_modules/doctr/datasets/iiit5k.html +++ b/v0.8.1/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -429,7 +429,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/iiithws.html b/v0.8.1/_modules/doctr/datasets/iiithws.html index e7c0d4e8dd..052a85cd56 100644 --- a/v0.8.1/_modules/doctr/datasets/iiithws.html +++ b/v0.8.1/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -401,7 +401,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/imgur5k.html b/v0.8.1/_modules/doctr/datasets/imgur5k.html index eb12e48784..f6c1a4692c 100644 --- a/v0.8.1/_modules/doctr/datasets/imgur5k.html +++ b/v0.8.1/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -473,7 +473,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/loader.html b/v0.8.1/_modules/doctr/datasets/loader.html index cdaec1bb70..9b2b3126de 100644 --- a/v0.8.1/_modules/doctr/datasets/loader.html +++ b/v0.8.1/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -428,7 +428,7 @@

Source code for doctr.datasets.loader

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/mjsynth.html b/v0.8.1/_modules/doctr/datasets/mjsynth.html index d7a7e66e35..c95f99e6d5 100644 --- a/v0.8.1/_modules/doctr/datasets/mjsynth.html +++ b/v0.8.1/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -432,7 +432,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/ocr.html b/v0.8.1/_modules/doctr/datasets/ocr.html index c6e09faee3..a1a249b259 100644 --- a/v0.8.1/_modules/doctr/datasets/ocr.html +++ b/v0.8.1/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -397,7 +397,7 @@

Source code for doctr.datasets.ocr

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/recognition.html b/v0.8.1/_modules/doctr/datasets/recognition.html index 1e14da06a9..95612cdadb 100644 --- a/v0.8.1/_modules/doctr/datasets/recognition.html +++ b/v0.8.1/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -382,7 +382,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/sroie.html b/v0.8.1/_modules/doctr/datasets/sroie.html index f3ac7b9547..32b4b17983 100644 --- a/v0.8.1/_modules/doctr/datasets/sroie.html +++ b/v0.8.1/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -429,7 +429,7 @@

Source code for doctr.datasets.sroie

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/svhn.html b/v0.8.1/_modules/doctr/datasets/svhn.html index f10a8cfd8e..5633dcfd6c 100644 --- a/v0.8.1/_modules/doctr/datasets/svhn.html +++ b/v0.8.1/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -457,7 +457,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/svt.html b/v0.8.1/_modules/doctr/datasets/svt.html index 0d64efedf4..0ed4482c50 100644 --- a/v0.8.1/_modules/doctr/datasets/svt.html +++ b/v0.8.1/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -443,7 +443,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/synthtext.html b/v0.8.1/_modules/doctr/datasets/synthtext.html index 333de06da8..edd5c63c80 100644 --- a/v0.8.1/_modules/doctr/datasets/synthtext.html +++ b/v0.8.1/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/utils.html b/v0.8.1/_modules/doctr/datasets/utils.html index 6e90a6400d..eeee0b2654 100644 --- a/v0.8.1/_modules/doctr/datasets/utils.html +++ b/v0.8.1/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -542,7 +542,7 @@

Source code for doctr.datasets.utils

     
   
- + diff --git a/v0.8.1/_modules/doctr/datasets/wildreceipt.html b/v0.8.1/_modules/doctr/datasets/wildreceipt.html index 2b386ae694..6b5a52a10e 100644 --- a/v0.8.1/_modules/doctr/datasets/wildreceipt.html +++ b/v0.8.1/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -437,7 +437,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.8.1/_modules/doctr/documents/elements.html b/v0.8.1/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.8.1/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-
-[docs]
-class Word(Element):
-    """Implements a word element
-
-    Args:
-        value: the text string of the word
-        confidence: the confidence associated with the text prediction
-        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
-            the page's size
-    """
-
-    _exported_keys: List[str] = ["value", "confidence", "geometry"]
-
-    def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None:
-        super().__init__()
-        self.value = value
-        self.confidence = confidence
-        self.geometry = geometry
-
-    def render(self) -> str:
-        """Renders the full text of the element"""
-        return self.value
-
-    def extra_repr(self) -> str:
-        return f"value='{self.value}', confidence={self.confidence:.2}"
-
-
-
-[docs]
-class Artefact(Element):
-    """Implements a non-textual element
-
-    Args:
-        artefact_type: the type of artefact
-        confidence: the confidence of the type prediction
-        geometry: bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
-            the page's size.
-    """
-
-    _exported_keys: List[str] = ["geometry", "type", "confidence"]
-
-    def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None:
-        super().__init__()
-        self.geometry = geometry
-        self.type = artefact_type
-        self.confidence = confidence
-
-    def render(self) -> str:
-        """Renders the full text of the element"""
-        return f"[{self.type.upper()}]"
-
-    def extra_repr(self) -> str:
-        return f"type='{self.type}', confidence={self.confidence:.2}"
-
-
-
-[docs]
-class Line(Element):
-    """Implements a line element as a collection of words
-
-    Args:
-        words: list of word elements
-        geometry: bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
-            the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
-            all words in it.
-    """
-
-    _exported_keys: List[str] = ["geometry"]
-    words: List[Word] = []
-
-    def __init__(
-        self,
-        words: List[Word],
-        geometry: Optional[Union[BoundingBox, RotatedBbox]] = None,
-    ) -> None:
-        # Resolve the geometry using the smallest enclosing bounding box
-        if geometry is None:
-            # Check whether this is a rotated or straight box
-            box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox
-            geometry = box_resolution_fn([w.geometry for w in words])  # type: ignore[operator, misc]
-
-        super().__init__(words=words)
-        self.geometry = geometry
-
-    def render(self) -> str:
-        """Renders the full text of the element"""
-        return " ".join(w.render() for w in self.words)
-
-
-
-[docs]
-class Block(Element):
-    """Implements a block element as a collection of lines and artefacts
-
-    Args:
-        lines: list of line elements
-        artefacts: list of artefacts
-        geometry: bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
-            the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
-            all lines and artefacts in it.
-    """
-
-    _exported_keys: List[str] = ["geometry"]
-    lines: List[Line] = []
-    artefacts: List[Artefact] = []
-
-    def __init__(
-        self,
-        lines: List[Line] = [],
-        artefacts: List[Artefact] = [],
-        geometry: Optional[Union[BoundingBox, RotatedBbox]] = None,
-    ) -> None:
-        # Resolve the geometry using the smallest enclosing bounding box
-        if geometry is None:
-            line_boxes = [word.geometry for line in lines for word in line.words]
-            artefact_boxes = [artefact.geometry for artefact in artefacts]
-            box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox
-            geometry = box_resolution_fn(line_boxes + artefact_boxes)  # type: ignore[operator, arg-type]
-
-        super().__init__(lines=lines, artefacts=artefacts)
-        self.geometry = geometry
-
-    def render(self, line_break: str = '\n') -> str:
-        """Renders the full text of the element"""
-        return line_break.join(line.render() for line in self.lines)
-
-
-
-[docs]
-class Page(Element):
-    """Implements a page element as a collection of blocks
-
-    Args:
-        blocks: list of block elements
-        page_idx: the index of the page in the input raw document
-        dimensions: the page size in pixels in format (width, height)
-        orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
-        language: a dictionary with the language value and confidence of the prediction
-    """
-
-    _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
-    blocks: List[Block] = []
-
-    def __init__(
-        self,
-        blocks: List[Block],
-        page_idx: int,
-        dimensions: Tuple[int, int],
-        orientation: Optional[Dict[str, Any]] = None,
-        language: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(blocks=blocks)
-        self.page_idx = page_idx
-        self.dimensions = dimensions
-        self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
-        self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)
-
-    def render(self, block_break: str = '\n\n') -> str:
-        """Renders the full text of the element"""
-        return block_break.join(b.render() for b in self.blocks)
-
-    def extra_repr(self) -> str:
-        return f"dimensions={self.dimensions}"
-
-
-[docs]
-    def show(
-        self, page: np.ndarray, interactive: bool = True, **kwargs
-    ) -> None:
-        """Overlay the result on a given image
-
-        Args:
-            page: image encoded as a numpy array in uint8
-            interactive: whether the display should be interactive
-        """
-        visualize_page(self.export(), page, interactive=interactive)
-        plt.show(**kwargs)
-
-
-
-[docs]
-class Document(Element):
-    """Implements a document element as a collection of pages
-
-    Args:
-        pages: list of page elements
-    """
-
-    pages: List[Page] = []
-
-    def __init__(
-        self,
-        pages: List[Page],
-    ) -> None:
-        super().__init__(pages=pages)
-
-    def render(self, page_break: str = '\n\n\n\n') -> str:
-        """Renders the full text of the element"""
-        return page_break.join(p.render() for p in self.pages)
-
-
-[docs]
-    def show(self, pages: List[np.ndarray], **kwargs) -> None:
-        """Overlay the result on a given image
-
-        Args:
-            pages: list of images encoded as numpy arrays in uint8
-        """
-        for img, result in zip(pages, self.pages):
-            result.show(img, **kwargs)
-
\ No newline at end of file diff --git a/v0.8.1/_modules/doctr/documents/reader.html b/v0.8.1/_modules/doctr/documents/reader.html deleted file mode index cdcd814b6c..0000000000 --- a/v0.8.1/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - doctr.documents.reader - docTR documentation

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-
-[docs]
-def read_img(
-    file: AbstractFile,
-    output_size: Optional[Tuple[int, int]] = None,
-    rgb_output: bool = True,
-) -> np.ndarray:
-    """Read an image file into numpy format
-
-    Example::
-        >>> from doctr.documents import read_img
-        >>> page = read_img("path/to/your/doc.jpg")
-
-    Args:
-        file: the path to the image file
-        output_size: the expected output size of each page in format H x W
-        rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
-    Returns:
-        the page decoded as numpy ndarray of shape H x W x 3
-    """
-
-    if isinstance(file, (str, Path)):
-        if not Path(file).is_file():
-            raise FileNotFoundError(f"unable to access {file}")
-        img = cv2.imread(str(file), cv2.IMREAD_COLOR)
-    elif isinstance(file, bytes):
-        file = np.frombuffer(file, np.uint8)
-        img = cv2.imdecode(file, cv2.IMREAD_COLOR)
-    else:
-        raise TypeError("unsupported object type for argument 'file'")
-
-    # Validity check
-    if img is None:
-        raise ValueError("unable to read file.")
-    # Resizing
-    if isinstance(output_size, tuple):
-        img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR)
-    # Switch the channel order
-    if rgb_output:
-        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-    return img
-
-
-
-[docs]
-def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
-    """Read a PDF file with PyMuPDF
-
-    Example::
-        >>> from doctr.documents import read_pdf
-        >>> doc = read_pdf("path/to/your/doc.pdf")
-
-    Args:
-        file: the path to the PDF file
-    Returns:
-        the PDF document loaded with PyMuPDF (fitz)
-    """
-
-    if isinstance(file, (str, Path)) and not Path(file).is_file():
-        raise FileNotFoundError(f"unable to access {file}")
-
-    fitz_args: Dict[str, AbstractFile] = {}
-
-    if isinstance(file, (str, Path)):
-        fitz_args['filename'] = file
-    elif isinstance(file, bytes):
-        fitz_args['stream'] = file
-    else:
-        raise TypeError("unsupported object type for argument 'file'")
-
-    # Read pages with fitz and convert them to numpy ndarrays
-    return fitz.open(**fitz_args, filetype="pdf", **kwargs)
-
-
-
-def convert_page_to_numpy(
-    page: fitz.fitz.Page,
-    output_size: Optional[Tuple[int, int]] = None,
-    bgr_output: bool = False,
-    default_scales: Tuple[float, float] = (2, 2),
-) -> np.ndarray:
-    """Convert a fitz page to a numpy-formatted image
-
-    Args:
-        page: the page of a file read with PyMuPDF
-        output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf;
-            if you want to increase the resolution while preserving the original A4 aspect ratio, you can pass (1024, 726)
-        bgr_output: whether the output ndarray channel order should be BGR instead of RGB.
-        default_scales: spatial scaling to be applied when output_size is not specified where (1, 1)
-            corresponds to 72 dpi rendering.
-
-    Returns:
-        the rendered image in numpy format
-    """
-
-    # If no output size is specified, keep the original one
-    if output_size is not None:
-        scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3])
-    else:
-        # Default 72 DPI (scales of (1, 1)) is unnecessarily low
-        scales = default_scales
-
-    transform_matrix = fitz.Matrix(*scales)
-
-    # Generate the pixel map using the transformation matrix
-    pixmap = page.getPixmap(matrix=transform_matrix)
-    # Decode it into a numpy array
-    img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3)
-
-    # Switch the channel order
-    if bgr_output:
-        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-
-    return img
-
-
-
-[docs]
-def read_html(url: str, **kwargs: Any) -> bytes:
-    """Read a web page and convert it into a PDF
-
-    Example::
-        >>> from doctr.documents import read_html
-        >>> doc = read_html("https://www.yoursite.com")
-
-    Args:
-        url: URL of the target web page
-    Returns:
-        decoded PDF file as a bytes stream
-    """
-
-    return HTML(url, **kwargs).write_pdf()
-
-
-
-[docs]
-class PDF:
-    """PDF document template
-
-    Args:
-        doc: input PDF document
-    """
-    def __init__(self, doc: fitz.Document) -> None:
-        self.doc = doc
-
-
-[docs]
-    def as_images(self, **kwargs) -> List[np.ndarray]:
-        """Convert all document pages to images
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-        Args:
-            kwargs: keyword arguments of `convert_page_to_numpy`
-        Returns:
-            the list of pages decoded as numpy ndarray of shape H x W x 3
-        """
-        return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
-
-
-
-    def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]:
-        """Get the annotations for all words of a given page"""
-
-        # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
-        return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)]
-
-
-[docs]
-    def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]:
-        """Get the annotations for all words in the document
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-        Args:
-            kwargs: keyword arguments of `fitz.Page.getTextWords`
-        Returns:
-            the list of pages annotations, represented as a list of tuple (bounding box, value)
-        """
-        return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
-
-
-
-    def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]:
-        return [tuple(self.doc[idx].getImageBbox(artefact))  # type: ignore[misc]
-                for artefact in self.doc[idx].get_images(full=True)]
-
-
-[docs]
-    def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]:
-        """Get the artefacts for the entire document
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-        Returns:
-            the list of pages artefacts, represented as a list of bounding boxes
-        """
-
-        return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
-
-
-[docs]
-class DocumentFile:
-    """Read a document from multiple extensions"""
-
-
-[docs]
-    @classmethod
-    def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF:
-        """Read a PDF file
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-        Args:
-            file: the path to the PDF file or a binary stream
-        Returns:
-            a PDF document
-        """
-
-        doc = read_pdf(file, **kwargs)
-
-        return PDF(doc)
-
-
-[docs]
-    @classmethod
-    def from_url(cls, url: str, **kwargs) -> PDF:
-        """Interpret a web page as a PDF document
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-        Args:
-            url: the URL of the target web page
-        Returns:
-            a PDF document
-        """
-        pdf_stream = read_html(url)
-        return cls.from_pdf(pdf_stream, **kwargs)
-
-
-[docs]
-    @classmethod
-    def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]:
-        """Read an image file (or a collection of image files) and convert it into an image in numpy format
-
-        Example::
-            >>> from doctr.documents import DocumentFile
-            >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-        Args:
-            files: the path to the image file or a binary stream, or a collection of those
-        Returns:
-            the list of pages decoded as numpy ndarray of shape H x W x 3
-        """
-        if isinstance(files, (str, Path, bytes)):
-            files = [files]
-
-        return [read_img(file, **kwargs) for file in files]
-
\ No newline at end of file diff --git a/v0.8.1/_modules/doctr/io/elements.html b/v0.8.1/_modules/doctr/io/elements.html index 78ea4cc7cf..a8d52c457f 100644 --- a/v0.8.1/_modules/doctr/io/elements.html +++ b/v0.8.1/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -960,7 +960,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.8.1/_modules/doctr/io/html.html b/v0.8.1/_modules/doctr/io/html.html index a1eb075da0..34a60da286 100644 --- a/v0.8.1/_modules/doctr/io/html.html +++ b/v0.8.1/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -354,7 +354,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.8.1/_modules/doctr/io/image/base.html b/v0.8.1/_modules/doctr/io/image/base.html index 1b42de0506..54663fa868 100644 --- a/v0.8.1/_modules/doctr/io/image/base.html +++ b/v0.8.1/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -382,7 +382,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.8.1/_modules/doctr/io/image/tensorflow.html b/v0.8.1/_modules/doctr/io/image/tensorflow.html index 02325e0630..cf030207d4 100644 --- a/v0.8.1/_modules/doctr/io/image/tensorflow.html +++ b/v0.8.1/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -439,7 +439,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.8.1/_modules/doctr/io/pdf.html b/v0.8.1/_modules/doctr/io/pdf.html index 7d82b6573c..7dcb3e2381 100644 --- a/v0.8.1/_modules/doctr/io/pdf.html +++ b/v0.8.1/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -368,7 +368,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.8.1/_modules/doctr/io/reader.html b/v0.8.1/_modules/doctr/io/reader.html index 5a8c87d168..5568ce7e0f 100644 --- a/v0.8.1/_modules/doctr/io/reader.html +++ b/v0.8.1/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -414,7 +414,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.8.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.8.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 1b97d83911..4dd332b464 100644 --- a/v0.8.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -518,7 +518,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.8.1/_modules/doctr/models/classification/mobilenet/tensorflow.html index b583e184fa..7dbc971810 100644 --- a/v0.8.1/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -747,7 +747,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.8.1/_modules/doctr/models/classification/resnet/tensorflow.html index 67c7ede371..77a5747d8b 100644 --- a/v0.8.1/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -730,7 +730,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.8.1/_modules/doctr/models/classification/textnet/tensorflow.html index a36ebab4f6..45bcea9658 100644 --- a/v0.8.1/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -599,7 +599,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.8.1/_modules/doctr/models/classification/vgg/tensorflow.html index 57e34af78f..8dc381674b 100644 --- a/v0.8.1/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -439,7 +439,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.8.1/_modules/doctr/models/classification/vit/tensorflow.html index 717a6d1649..84d68b5388 100644 --- a/v0.8.1/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -521,7 +521,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/classification/zoo.html b/v0.8.1/_modules/doctr/models/classification/zoo.html index 87f2d2956d..d1f749776e 100644 --- a/v0.8.1/_modules/doctr/models/classification/zoo.html +++ b/v0.8.1/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -400,7 +400,7 @@

Source code for doctr.models.classification.zoo

- + diff --git a/v0.8.1/_modules/doctr/models/detection/differentiable_binarization.html b/v0.8.1/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode index 38e9b36ec2..0000000000 --- a/v0.8.1/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@ - doctr.models.detection.differentiable_binarization - docTR documentation

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to unshrink polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio, and returns a 4-points box
-
-        Args:
-            points: The first parameter.
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly casted to a ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing x, y, w, h, score for the box
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too-small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channel to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): dilation factor to scale the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature maps is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
-
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon threshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coord., to draw the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and a list of masks for each image. From there it computes the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-
-[docs]
-def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet:
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import db_resnet50
-        >>> model = db_resnet50(pretrained=True)
-        >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on our text detection dataset
-
-    Returns:
-        text detection architecture
-    """
-
-    return _db_resnet('db_resnet50', pretrained, **kwargs)
-
\ No newline at end of file diff --git a/v0.8.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.8.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index b3523b2fb5..5cf3b58dbb 100644 --- a/v0.8.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -731,7 +731,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo

- + diff --git a/v0.8.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.8.1/_modules/doctr/models/detection/fast/tensorflow.html index 73eeecd71f..c383007826 100644 --- a/v0.8.1/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -760,7 +760,7 @@

Source code for doctr.models.detection.fast.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/detection/linknet.html b/v0.8.1/_modules/doctr/models/detection/linknet.html deleted file mode index 129cfdce8b..0000000000 --- a/v0.8.1/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@ - doctr.models.detection.linknet - docTR documentation

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from differentiable linknet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor boxes for the bitmap, each box is a 5-element list
-                containing x, y, w, h, score for the box
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and a list of masks for each image
-        Then, it computes the loss function with proba_map, gts and masks
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
- -
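A minimal usage sketch (assuming the `linknet` factory defined above; every call argument appears in the `call` signature shown earlier). With no target, the model returns its raw probability map and the post-processed boxes:

    >>> import tensorflow as tf
    >>> from doctr.models import linknet
    >>> model = linknet(pretrained=False)
    >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    >>> out = model(input_tensor, return_model_output=True, return_boxes=True)
    >>> sorted(out.keys())  # ['boxes', 'out_map'] when no target is provided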
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.8.1/_modules/doctr/models/detection/linknet/tensorflow.html index c5fd053513..d374bb6d1e 100644 --- a/v0.8.1/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -698,7 +698,7 @@

Source code for doctr.models.detection.linknet.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/detection/zoo.html b/v0.8.1/_modules/doctr/models/detection/zoo.html index ce4a60785f..43326bb2a1 100644 --- a/v0.8.1/_modules/doctr/models/detection/zoo.html +++ b/v0.8.1/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -424,7 +424,7 @@

Source code for doctr.models.detection.zoo

     
   
- + diff --git a/v0.8.1/_modules/doctr/models/export.html b/v0.8.1/_modules/doctr/models/export.html deleted file mode 100644 index f25a81aa21..0000000000 --- a/v0.8.1/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@ - doctr.models.export - docTR documentation

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- -
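A minimal sketch of persisting the serialized bytes returned by these converters, reusing the toy model from the docstrings above (the output file name is purely illustrative):

    >>> from tensorflow.keras import Sequential
    >>> from doctr.models import quantize_model, conv_sequence
    >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
    >>> serialized_model = quantize_model(model, (224, 224, 3))
    >>> with open('quantized_model.tflite', 'wb') as f:  # illustrative path
    ...     _ = f.write(serialized_model)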
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_modules/doctr/models/factory/hub.html b/v0.8.1/_modules/doctr/models/factory/hub.html index a49f4ebde7..93aa0aa8f3 100644 --- a/v0.8.1/_modules/doctr/models/factory/hub.html +++ b/v0.8.1/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -572,7 +572,7 @@

Source code for doctr.models.factory.hub

     
   
- + diff --git a/v0.8.1/_modules/doctr/models/recognition/crnn.html b/v0.8.1/_modules/doctr/models/recognition/crnn.html deleted file mode 100644 index daa2393439..0000000000 --- a/v0.8.1/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@ - doctr.models.recognition.crnn - docTR documentation

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs decoding of raw output with CTC and decoding of CTC predictions
-        with label_to_idx mapping dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded words of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            model_output: predicted logits of the model
-            target: list of target strings to encode
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
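A minimal training-oriented sketch, assuming the `crnn_vgg16_bn` factory above and that `target` takes a list of label strings as in the `call` signature shown earlier:

    >>> import tensorflow as tf
    >>> from doctr.models import crnn_vgg16_bn
    >>> model = crnn_vgg16_bn(pretrained=False)
    >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
    >>> out = model(input_tensor, target=['hello'], return_model_output=True)
    >>> loss, logits = out['loss'], out['out_map']  # CTC loss and raw logits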
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.8.1/_modules/doctr/models/recognition/crnn/tensorflow.html index a00647e1b2..a8a19605ba 100644 --- a/v0.8.1/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -650,7 +650,7 @@

Source code for doctr.models.recognition.crnn.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.8.1/_modules/doctr/models/recognition/master/tensorflow.html index 446786da5f..fa02c4de73 100644 --- a/v0.8.1/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -644,7 +644,7 @@

Source code for doctr.models.recognition.master.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.8.1/_modules/doctr/models/recognition/parseq/tensorflow.html index bd56053be1..d06bbd51e6 100644 --- a/v0.8.1/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -837,7 +837,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/recognition/sar.html b/v0.8.1/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.8.1/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - doctr.models.recognition.sar - docTR documentation

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H, W, C) -> (N, C)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, 1)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + 1) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read: A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- -
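A minimal sketch of running the SAR recognizer with and without labels, assuming the `sar_resnet31` factory above; when no target is passed, post-processed predictions are returned under the `preds` key as (word, confidence) pairs:

    >>> import tensorflow as tf
    >>> from doctr.models import sar_resnet31
    >>> model = sar_resnet31(pretrained=False)
    >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
    >>> preds = model(input_tensor)['preds']                  # [(word, confidence), ...]
    >>> loss = model(input_tensor, target=['hello'])['loss']  # per-sample loss tensor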
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.8.1/_modules/doctr/models/recognition/sar/tensorflow.html index 6a44c6d2f4..9bbcdfbf81 100644 --- a/v0.8.1/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -753,7 +753,7 @@

Source code for doctr.models.recognition.sar.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.8.1/_modules/doctr/models/recognition/vitstr/tensorflow.html index 1a97114efa..7131ac4a5b 100644 --- a/v0.8.1/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.8.1/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -610,7 +610,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/models/recognition/zoo.html b/v0.8.1/_modules/doctr/models/recognition/zoo.html index 4c61c0e058..b6896dd45c 100644 --- a/v0.8.1/_modules/doctr/models/recognition/zoo.html +++ b/v0.8.1/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -401,7 +401,7 @@

Source code for doctr.models.recognition.zoo

   
- + diff --git a/v0.8.1/_modules/doctr/models/zoo.html b/v0.8.1/_modules/doctr/models/zoo.html index 7d5510b773..a964ff6aff 100644 --- a/v0.8.1/_modules/doctr/models/zoo.html +++ b/v0.8.1/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -570,7 +570,7 @@

Source code for doctr.models.zoo

     
   
- + diff --git a/v0.8.1/_modules/doctr/transforms/modules.html b/v0.8.1/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.8.1/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - doctr.transforms.modules - docTR documentation

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-[docs] -class LambdaTransformation(NestedObject): - """Apply a user-defined function to a tensor (image or batch of images) - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-[docs] -class ToGray(NestedObject): - """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-[docs] -class ColorInversion(NestedObject): - """Applies the following tranformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Brightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - p: probability to apply transformation - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Saturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Hue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-[docs] -class RandomGamma(NestedObject): - """randomly performs gamma correction for a tensor (batch of images or image) - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Gamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = JpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, one only will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-[docs] -class RandomApply(NestedObject): - """Apply with a probability p the input transformation - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
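A minimal sketch chaining several of the transformations defined in this module (`Compose`, `Resize`, `RandomApply`, `RandomBrightness`), all with the signatures shown above:

    >>> import tensorflow as tf
    >>> from doctr.transforms import Compose, Resize, RandomApply, RandomBrightness
    >>> transfos = Compose([Resize((32, 32)), RandomApply(RandomBrightness(max_delta=0.1), p=0.5)])
    >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))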
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_modules/doctr/transforms/modules/base.html b/v0.8.1/_modules/doctr/transforms/modules/base.html index 42e8b8d2b1..087636ae0d 100644 --- a/v0.8.1/_modules/doctr/transforms/modules/base.html +++ b/v0.8.1/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -615,7 +615,7 @@

Source code for doctr.transforms.modules.base

- + diff --git a/v0.8.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.8.1/_modules/doctr/transforms/modules/tensorflow.html index 5e85447d5c..9ef65dafc0 100644 --- a/v0.8.1/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.8.1/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -888,7 +888,7 @@

Source code for doctr.transforms.modules.tensorflow

- + diff --git a/v0.8.1/_modules/doctr/utils/metrics.html b/v0.8.1/_modules/doctr/utils/metrics.html index 5190fb3dd2..bec0aee3f4 100644 --- a/v0.8.1/_modules/doctr/utils/metrics.html +++ b/v0.8.1/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -1071,7 +1071,7 @@

Source code for doctr.utils.metrics

     
   
- + diff --git a/v0.8.1/_modules/doctr/utils/visualization.html b/v0.8.1/_modules/doctr/utils/visualization.html index 9094dda132..d7c33dc75a 100644 --- a/v0.8.1/_modules/doctr/utils/visualization.html +++ b/v0.8.1/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -830,7 +830,7 @@

Source code for doctr.utils.visualization

     
   
- + diff --git a/v0.8.1/_modules/index.html b/v0.8.1/_modules/index.html index db6e0f4507..0c3394d1db 100644 --- a/v0.8.1/_modules/index.html +++ b/v0.8.1/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -372,7 +372,7 @@

All modules for which code is available

- + diff --git a/v0.8.1/_sources/datasets.rst.txt b/v0.8.1/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.8.1/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.8.1/_sources/documents.rst.txt b/v0.8.1/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.8.1/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. 
automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.8.1/_sources/installing.rst.txt b/v0.8.1/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.8.1/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.8.1/_sources/models.rst.txt b/v0.8.1/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.8.1/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. 
autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.8.1/_sources/transforms.rst.txt b/v0.8.1/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.8.1/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.8.1/_sources/utils.rst.txt b/v0.8.1/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.8.1/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. 
autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.8.1/_static/basic.css b/v0.8.1/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.8.1/_static/basic.css +++ b/v0.8.1/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.8.1/_static/doctools.js b/v0.8.1/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.8.1/_static/doctools.js +++ b/v0.8.1/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.8.1/_static/language_data.js b/v0.8.1/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.8.1/_static/language_data.js +++ b/v0.8.1/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.8.1/_static/searchtools.js b/v0.8.1/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.8.1/_static/searchtools.js +++ b/v0.8.1/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. 
+class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.8.1/changelog.html b/v0.8.1/changelog.html index 3f802a9ee4..f805dd7cba 100644 --- a/v0.8.1/changelog.html +++ b/v0.8.1/changelog.html @@ -14,7 +14,7 @@ - + Changelog - docTR documentation @@ -425,7 +425,7 @@

v0.1.0 (2021-03-05) - + diff --git a/v0.8.1/community/resources.html b/v0.8.1/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.8.1/community/resources.html +++ b/v0.8.1/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.8.1/contributing/code_of_conduct.html b/v0.8.1/contributing/code_of_conduct.html index 51e26624aa..c8e56f2887 100644 --- a/v0.8.1/contributing/code_of_conduct.html +++ b/v0.8.1/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -498,7 +498,7 @@

Attribution - + diff --git a/v0.8.1/contributing/contributing.html b/v0.8.1/contributing/contributing.html index 967bbb0d4d..0ed3b44c28 100644 --- a/v0.8.1/contributing/contributing.html +++ b/v0.8.1/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -475,7 +475,7 @@

Let’s connect - + diff --git a/v0.8.1/datasets.html b/v0.8.1/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.8.1/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -
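For custom datasets, this base class can be subclassed directly; a minimal sketch (the URL and archive name below are hypothetical placeholders, not a real resource):

>>> from doctr.datasets.datasets import VisionDataset
>>> class MyDataset(VisionDataset):
...     def __init__(self, **kwargs):
...         # the base class handles verified downloading & archive extraction (hypothetical URL)
...         super().__init__("https://example.com/my_dataset.zip", extract_archive=True, download=True, **kwargs)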

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
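Unlike the public datasets above, OCRDataset points to your own data; a minimal sketch (the paths are placeholders and the label file is assumed to follow the expected annotation format):

>>> from doctr.datasets import OCRDataset
>>> train_set = OCRDataset(img_folder="path/to/images", label_file="path/to/labels.json")
>>> img, target = train_set[0]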
-
-

Data Loading

-

Each dataset has its own specific way of loading a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before passing it to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name

size

characters

digits

10

0123456789

ascii_letters

52

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

punctuation

32

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

currency

5

£€¥¢฿

latin

96

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

french

154

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as a mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
- -
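As a minimal sketch of the function above (the vocab string and sizes here are arbitrary illustrations, not one of the built-in vocabs):

>>> from doctr.datasets import encode_sequences
>>> # arbitrary illustration vocab; see the table above for the built-in ones
>>> encoded = encode_sequences(sequences=["hello", "world"], vocab="abcdefghijklmnopqrstuvwxyz", target_size=10)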
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/documents.html b/v0.8.1/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.8.1/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size

  • -
-
-
-
- -
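Since geometries are expressed relative to the page size, converting a word's box to absolute pixel coordinates only requires the page dimensions; a small sketch (all values are arbitrary):

>>> from doctr.documents import Word
>>> word = Word(value="hello", confidence=0.99, geometry=((0.1, 0.1), (0.3, 0.2)))
>>> width, height = 595, 842  # page size in pixels, arbitrary here
>>> (xmin, ymin), (xmax, ymax) = word.geometry
>>> box_in_pixels = (int(xmin * width), int(ymin * height), int(xmax * width), int(ymax * height))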
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, words at the same height but in different columns belong to two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and the confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
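To make the hierarchy concrete, here is a minimal sketch that assembles a one-word document by hand (in practice these objects are produced for you by the OCR predictors; all values are arbitrary):

>>> from doctr.documents import Word, Line, Block, Page, Document
>>> word = Word(value="hello", confidence=0.99, geometry=((0.1, 0.1), (0.2, 0.15)))
>>> line = Line(words=[word])
>>> block = Block(lines=[line])
>>> page = Page(blocks=[block], page_idx=0, dimensions=(595, 842))
>>> doc = Document(pages=[page])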
-
-
-

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert its pages into images in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF file, returned as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert them into images in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/genindex.html b/v0.8.1/genindex.html index ac2652d193..985d8bd94d 100644 --- a/v0.8.1/genindex.html +++ b/v0.8.1/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -746,7 +746,7 @@

W

- + diff --git a/v0.8.1/getting_started/installing.html b/v0.8.1/getting_started/installing.html index a3422df41a..523060e132 100644 --- a/v0.8.1/getting_started/installing.html +++ b/v0.8.1/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -431,7 +431,7 @@

Via Git - + diff --git a/v0.8.1/index.html b/v0.8.1/index.html index cde19babe1..ef6d129db6 100644 --- a/v0.8.1/index.html +++ b/v0.8.1/index.html @@ -14,7 +14,7 @@ - + docTR documentation @@ -437,7 +437,7 @@

Supported datasets - + diff --git a/v0.8.1/installing.html b/v0.8.1/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.8.1/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -
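For instance, on a standard pip setup, either backend can be installed as below (refer to the official installation pages above for the command matching your OS and hardware):

pip install tensorflow
# or
pip install torch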

If you are running an OS other than Linux, you will need a few extra dependencies.

-

For macOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the latest stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/models.html b/v0.8.1/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.8.1/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive sub-tasks: text detection and text recognition. Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Input shape

# params

Recall

Precision

Recall

Precision

FPS

db_resnet50

(1024, 1024, 3)

25.2 M

82.14

87.64

92.49

89.66

2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

-

FPS (frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up, then we measure its average speed on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). Experiments were run on a c5.12xlarge AWS instance (Xeon Platinum 8275L CPU).

-
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
-

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
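The predictor composes naturally with the file reading utilities from doctr.documents; a minimal sketch (the path is a placeholder):

>>> from doctr.documents import DocumentFile
>>> from doctr.models import detection_predictor
>>> model = detection_predictor(pretrained=True)
>>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
>>> out = model(pages)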
-
-
-

Text Recognition

-

Identifying strings in images

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Text recognition model zoo

Architecture

Input shape

# params

FUNSD

CORD

FPS

crnn_vgg16_bn

(32, 128, 3)

15.8M

86.02

91.3

12.8

sar_vgg16_bn

(32, 128, 3)

21.5M

86.2

91.7

3.3

sar_resnet31

(32, 128, 3)

53.1M

86.3

92.1

2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our French vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model capabilities

-

FPS (frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up, then we measure its average speed on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). Experiments were run on a c5.12xlarge AWS instance (Xeon Platinum 8275L CPU).

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
-
-

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
-

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import crnn_vgg16_bn
->>> model = crnn_vgg16_bn(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import sar_vgg16_bn
->>> model = sar_vgg16_bn(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a ResNet-31 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

-

Example

-
>>> import tensorflow as tf
->>> from doctr.models import sar_resnet31
->>> model = sar_resnet31(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
-

MASTER as described in the paper available at https://arxiv.org/pdf/1910.02562.pdf. Example:

-
>>> import tensorflow as tf
->>> from doctr.models import master
->>> model = master(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-

Recognition predictors

-

Combining the right components around a given architecture for easier usage.

-
-
-doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
-

Text recognition architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import recognition_predictor
->>> model = recognition_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • -
  • pretrained – If True, returns a model pre-trained on our text recognition dataset

  • -
-
-
Returns:
-

Recognition predictor

-
-
-
- -
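Similarly, a recognition predictor can be run on pre-cropped word images read from disk; a minimal sketch (the path is a placeholder and the crop is assumed to contain a single word):

>>> from doctr.documents import read_img
>>> from doctr.models import recognition_predictor
>>> model = recognition_predictor(pretrained=True)
>>> crop = read_img("path/to/word_crop.jpg")
>>> out = model([crop])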
-
-
-

End-to-End OCR

-

Predictors that localize and identify text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FUNSD

CORD

Architecture

Recall

Precision

FPS

Recall

Precision

FPS

db_resnet50 + crnn_vgg16_bn

70.08

74.77

0.85

82.19

79.67

1.6

db_resnet50 + sar_vgg16_bn

N/A

N/A

0.49

N/A

N/A

1.0

db_resnet50 + sar_resnet31

N/A

N/A

0.27

N/A

N/A

0.83

Gvision text detection

59.50

62.50

75.30

70.00

Gvision doc. text detection

64.00

53.30

68.90

61.10

AWS textract

78.10

83.00

87.50

66.00

-
-

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

-

All recognition models of predictors are trained with our French vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

-

FPS (frames per second) is computed as follows: we instantiate the predictor, warm up the model, and then measure the average speed of the end-to-end predictor on the datasets with a batch size of 1. Experiments were run on a c5.12xlarge AWS instance (Xeon Platinum 8275L CPU).

-

Results on private OCR datasets

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Receipts

Invoices

IDs

Architecture

Recall

Precision

Recall

Precision

Recall

Precision

db_resnet50 + crnn_vgg16_bn (ours)

78.90

81.01

65.68

69.86

49.48

50.46

Gvision doc. text detection

68.91

59.89

63.20

52.85

43.70

29.21

AWS textract

75.77

77.70

70.47

69.13

46.39

43.32

-
-
-

Two-stage approaches

-

These architectures involve one stage of text detection and one stage of text recognition. The text detection output is used to produce cropped images that are then passed to the text recognition block.

-
-
-doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]
-

End-to-end OCR architecture using one model for localization, and another for text recognition.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import ocr_predictor
->>> model = ocr_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

  • -
  • pretrained – If True, returns a model pre-trained on our OCR dataset

  • -
-
-
Returns:
-

OCR predictor

-
-
-
- -
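Putting the pieces together, an end-to-end run over a PDF might look like this minimal sketch (the path is a placeholder; the last line assumes the predictor output exposes the document structure described in doctr.documents):

>>> from doctr.documents import DocumentFile
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)
>>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
>>> result = model(pages)
>>> result.pages[0].show(pages[0])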
-
-
-

Model export

-

Utility functions to make the most of document analysis models.

-
-

Model compression

-
-
-doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
-

Converts a model to TFLite format

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_tflite, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_tflite(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the model

-
-
Return type:
-

bytes

-
-
-
- -
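Since these helpers return the serialized model as raw bytes, persisting the result to disk is straightforward (building on the example above; the file name is arbitrary):

>>> with open("model.tflite", "wb") as f:
...     f.write(serialized_model)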
-
-doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
-

Converts a model to half precision

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_fp16, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_fp16(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the serialized FP16 model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
-

Quantize a TensorFlow model

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import quantize_model, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = quantize_model(model, (224, 224, 3))
-
-
-
-
-
-
Parameters:
-
    -
  • tf_model – a keras model

  • -
  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

  • -
-
-
Returns:
-

the serialized quantized model

-
-
Return type:
-

bytes

-
-
-
- -
-
-

Using SavedModel

-

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

-
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> _ = model(input_t, training=False)
->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
-
-
-

And loaded just as easily:

-
>>> import tensorflow as tf
->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
-
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/modules/contrib.html b/v0.8.1/modules/contrib.html index e99f6b3f74..7fb86b8b38 100644 --- a/v0.8.1/modules/contrib.html +++ b/v0.8.1/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -380,7 +380,7 @@

Supported contribution modules - + diff --git a/v0.8.1/modules/datasets.html b/v0.8.1/modules/datasets.html index 618fc1971e..409c9ea616 100644 --- a/v0.8.1/modules/datasets.html +++ b/v0.8.1/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -1049,7 +1049,7 @@

Returns: - + diff --git a/v0.8.1/modules/io.html b/v0.8.1/modules/io.html index 6ac8de4585..76c06b4207 100644 --- a/v0.8.1/modules/io.html +++ b/v0.8.1/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -752,7 +752,7 @@

Returns: - + diff --git a/v0.8.1/modules/models.html b/v0.8.1/modules/models.html index 8529b1e8f6..5678645a16 100644 --- a/v0.8.1/modules/models.html +++ b/v0.8.1/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1540,7 +1540,7 @@

Args: - + diff --git a/v0.8.1/modules/transforms.html b/v0.8.1/modules/transforms.html index e4eb8ee1fa..cf547592b1 100644 --- a/v0.8.1/modules/transforms.html +++ b/v0.8.1/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -804,7 +804,7 @@

Args:< - + diff --git a/v0.8.1/modules/utils.html b/v0.8.1/modules/utils.html index 06c83105f5..adf1487e5b 100644 --- a/v0.8.1/modules/utils.html +++ b/v0.8.1/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -737,7 +737,7 @@

Args:< - + diff --git a/v0.8.1/notebooks.html b/v0.8.1/notebooks.html index 7af9bc55b4..5b76c79ab2 100644 --- a/v0.8.1/notebooks.html +++ b/v0.8.1/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -379,7 +379,7 @@

docTR Notebooks - + diff --git a/v0.8.1/py-modindex.html b/v0.8.1/py-modindex.html deleted file mode 100644 index c1569be607..0000000000 --- a/v0.8.1/py-modindex.html +++ /dev/null @@ -1,330 +0,0 @@ - - - - - - - - - - - Python Module Index - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
- -
-

Python Module Index

- -
- - - - - - - - - - - -
 
d
- doctr -
- -
-
-
- - -
-
-
- -
-
- -
-
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.8.1/search.html b/v0.8.1/search.html index 2fe88c7d93..1dd2b30845 100644 --- a/v0.8.1/search.html +++ b/v0.8.1/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -334,7 +334,7 @@ - + diff --git a/v0.8.1/searchindex.js b/v0.8.1/searchindex.js index 4eb7904e61..c1e9808759 100644 --- a/v0.8.1/searchindex.js +++ b/v0.8.1/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Advanced options": [[16, "advanced-options"]], "Args:": [[5, "args"], [5, "id4"], [5, "id7"], [5, "id10"], [5, "id13"], [5, "id16"], [5, "id19"], [5, "id22"], [5, "id25"], [5, "id29"], [5, "id32"], [5, "id37"], [5, "id40"], [5, "id46"], [5, "id49"], [5, "id50"], [5, "id51"], [5, "id54"], [5, "id57"], [5, "id60"], [5, "id61"], [6, "args"], [6, "id2"], [6, "id3"], [6, "id4"], [6, "id5"], [6, "id6"], [6, "id7"], [6, "id10"], [6, "id12"], [6, "id14"], [6, "id16"], [6, "id20"], [6, "id24"], [6, "id28"], [7, "args"], [7, "id3"], [7, "id8"], [7, "id13"], [7, "id17"], [7, "id21"], [7, "id26"], [7, "id31"], [7, "id36"], [7, "id41"], [7, "id45"], [7, "id49"], [7, "id54"], [7, "id58"], [7, "id63"], [7, "id68"], [7, "id72"], [7, "id76"], [7, "id81"], [7, "id86"], [7, "id90"], [7, "id95"], [7, "id100"], [7, "id105"], [7, "id110"], [7, "id114"], [7, "id118"], [7, "id123"], [7, "id128"], [7, "id133"], [7, "id137"], [7, "id141"], [7, "id146"], [7, "id150"], [7, "id154"], [7, "id158"], [7, "id160"], [7, "id162"], [7, "id164"], [8, "args"], [8, "id1"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id8"], [8, "id9"], [8, "id10"], [8, "id11"], [8, "id12"], [8, "id13"], [8, "id14"], [8, "id15"], [8, "id16"], [8, "id17"], [8, "id18"], [9, "args"], [9, "id3"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, 
"feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface Hub": [[13, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Returns:": [[5, "returns"], [6, "returns"], [6, "id11"], [6, "id13"], [6, "id15"], [6, "id19"], [6, "id23"], [6, "id27"], [6, "id31"], [7, "returns"], [7, "id6"], [7, "id11"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id29"], [7, "id34"], [7, "id39"], [7, "id44"], [7, "id48"], [7, "id52"], [7, "id57"], [7, "id61"], [7, "id66"], [7, "id71"], [7, "id75"], [7, "id79"], [7, "id84"], [7, "id89"], [7, "id93"], [7, "id98"], [7, "id103"], [7, "id108"], [7, "id113"], [7, "id117"], [7, "id121"], [7, "id126"], [7, "id131"], [7, "id136"], [7, "id140"], [7, "id144"], [7, "id149"], [7, "id153"], [7, "id157"], [7, "id159"], [7, "id161"], [7, "id163"], [9, "returns"], [9, "id4"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, "use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Conda (Only for Linux)": [[3, "via-conda-only-for-linux"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], 
"doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module 
doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, 
"doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", 
false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[5, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WILDRECEIPT"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", 
"from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "textnet_base"], [7, 1, 1, "", "textnet_small"], [7, 1, 1, "", "textnet_tiny"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "fast_base"], [7, 1, 1, "", "fast_small"], [7, 1, 1, "", "fast_tiny"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": [], "02562": 7, "03": 16, "035": 16, "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 9, 11, 
16], "104": 5, "106": 5, "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": 5, "115": [], "1156": 14, "116": 5, "118": 5, "11800h": 16, "11th": 16, "12": [3, 16], "120": 5, "123": 5, "126": 5, "1268": 14, "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, "1396484375": 16, "14": 16, "1420": 16, "14470v1": 5, "149": 14, "15": 16, "150": [9, 16], "154": [], "1552": 16, "16": [7, 15, 16], "160": [], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": 7, "185546875": 16, "19": [], "1900": 16, "1910": 7, "19342": 14, "19370": 14, "195": 5, "19598": 14, "199": 16, "1999": 16, "1m": [], "2": [3, 4, 5, 6, 8, 16], "20": 16, "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2021": [], "2023": [], "207901": 14, "21": 16, "2103": 5, "2186": 14, "21888": 14, "22": 16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": 16, "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": 7, "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": [8, 16], "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": [], "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": 16, "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": 14, "48": [5, 16], "485": 8, "49": 16, "49377": 14, "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "533": [], "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "595": [], "597": 16, "5k": [4, 5], "5m": 16, "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "611": [], "62": 16, "625": [], "626": 14, "629": [], "63": 16, "630": [], "64": [7, 8, 16], "640": [], "641": 16, "647": 14, "65": 16, "66": 16, "660": [], "664": [], "666": [], "67": 16, "672": [], "68": 16, "689": [], "69": 16, "693": 11, "694": 11, "695": 11, "6m": 16, "7": 16, "70": [9, 16], "700": [], "701": [], "702": [], "707470": 14, "71": 16, "7100000": 14, "713": [], "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "733": [], "74": 16, "745": [], "75": [8, 16], "753": [], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "780": [], "781": [], "783": [], "785": 11, "789": [], "79": 16, "793533": 14, "796": 14, "798": 11, "7m": 16, "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "817": [], "82": 16, "8275l": [], "83": 16, "830": [], "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "860": [], "8603515625": 16, "862": [], "863": [], "87": 16, "8707": 14, "875": [], "88": 16, "89": 16, "8m": [], "9": 16, "90": 16, "90k": 5, "90kdict32px": 5, "91": 16, "913": [], "914085328578949": 16, "917": [], "92": 16, "921": [], "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": [], "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "And": [], "As": 2, "Be": 16, "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 16], 
"Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": 7, "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": 16, "_build": 2, "_i": 9, "ab": 5, "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, "abstract": [], "abstractdataset": 5, "abus": 1, "accent": [], "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": 16, "accuraci": 9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13, 16], "add_hook": 16, "add_label": 9, "addit": [2, 3, 6], "addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": 7, "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, "along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "andrej": [], "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": [4, 7], "arch": [7, 13], "architectur": [4, 7, 13], "archiv": [], "area": 16, "arg": [], "argument": [5, 6, 7, 9, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "articl": [], "artifici": [4, 5], "arxiv": [5, 7], "as_imag": [], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autoclass": [], "autom": 4, "automat": 16, "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": 16, "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "baranovskij": [], "base": [4, 7], "baselin": [4, 7, 16], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": 3, "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": [1, 16], "being": [9, 16], "belong": 16, "benchmark": 16, "best": 1, "beta": [], "better": [10, 16], "between": [8, 9, 16], "bgr": 6, "bilinear": 8, "bin_thresh": 16, "binar": [4, 7, 16], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": 16, "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [3, 6, 9], "c5": [], "c_j": 9, "cach": [2, 5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], "capabl": [2, 10, 16], "case": [5, 9], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "changelog": [], "channel": [1, 2, 6, 8], "channel_prior": 3, "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": 
[5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, "check": [2, 13, 16], "checkpoint": 7, "chip": 3, "christian": [], "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 7, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 5, "compil": [10, 16], "complaint": 1, "complementari": 9, "complet": 2, "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], "config": [3, 7], "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "conv_sequ": [], "convers": 6, "convert": [6, 8], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 7, "cool": [], "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_resnet31": [], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": [13, 16], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": 16, "cvit": 4, "czczup": 7, "czech": 5, "d": [5, 14], "daili": [], "danish": 5, "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": [], "db_sar_resnet": [], "db_sar_vgg": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": 16, "default": [3, 6, 9, 11, 12, 16], "defer": 14, "defin": [9, 15], "deform": [], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": [11, 16], "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, "detectionpredictor": [7, 11], "detector": [4, 7], "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "developp": [], "deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12, 16], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, 
"discuss": 2, "disk": [], "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distanc": [], "distribut": 8, "div": 16, "divers": 1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": 16, "documentfil": [6, 13], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [4, 5], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": 5, "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easier": [], "easili": [6, 9, 11, 13, 14, 16], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enough": [2, 16], "ensur": 2, "entir": [], "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "error": [], "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exactmatch": [], "exampl": [1, 2, 4, 5, 7, 13, 16], "exchang": 15, "exclud": [], "execut": 16, "exist": 13, "expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extract_arch": [], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": 13, "fast": [4, 5, 7], "fast_bas": [7, 16], "fast_smal": [7, 16], "fast_tini": [7, 16], "faster": [4, 7, 15], "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feed": [], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15, 16], "figsiz": 9, "figur": 9, "file": [2, 5], "file_hash": [], "file_nam": [], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": 5, "first": [2, 5], "firsthand": 5, "fit": [7, 16], "fitz": [], "flag": 16, "flexibl": [], "flip": 8, "float": [6, 8, 9, 15], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": 3, "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "found": [], "fp": [], "fp16": 15, "frac": 9, "frame": [], "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "fulli": [], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gallagh": [], "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11, 13], "get": 16, "get_artefact": [], "get_word": [], "gettextword": [], "git": 13, "github": [2, 3, 7, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, 
"googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": [4, 5, 6], "grayscal": 8, "ground": 9, "groung": 9, "group": [4, 16], "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, "gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "half": [], "handl": [14, 16], "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": 5, "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5, 16], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": 16, "horizont": [6, 8], "hous": 5, "how": [2, 11, 13, 14], "howev": 14, "hsv": 8, "html": [1, 2, 6, 16], "http": [1, 3, 5, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ibrahimov": [], "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 14], "img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 7, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": 7, "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inform": [1, 2, 4, 5, 14], "inherit": [], "input": [2, 6, 7, 8, 15, 16], "input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_t": [], "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "invoic": [], "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "issu": [1, 2, 13], "italian": 5, "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "jame": [], "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "kei": [4, 5], "kera": [7, 15], "kernel": [4, 7, 8], "kernel_s": [], "kernel_shap": 8, "keywoard": 7, "keyword": [5, 6, 7, 9], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": 1, "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], "legacy_french": 5, "length": [5, 16], "less": [15, 16], "let": [], "letter": [], "level": [1, 5, 9, 16], "levenshtein": [], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 7, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet16": [], 
"linknet_resnet18": [7, 11, 16], "linknet_resnet18_rot": [], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": [], "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loader": [], "loc_pr": 16, "local": [2, 4, 5, 7, 9, 14, 16], "localis": 5, "localizationconfus": 9, "locat": [2, 6, 16], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "love": 13, "lower": [8, 9, 16], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": [14, 16], "manipul": 16, "map": [5, 7], "map_loc": 11, "mask_shap": 9, "master": [4, 7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": [6, 9], "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_dist": [], "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": 7, "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [6, 8, 16], "metric": [9, 16], "middl": 16, "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": [4, 7], "minimum": [3, 5, 8, 9, 16], "minval": 8, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "modal": [4, 5], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12, 16], "modul": [6, 7, 8, 9, 16], "moment": [], "more": [2, 9, 14, 16], "moscardi": [], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [5, 13], "multipl": [5, 6, 8, 16], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": 16, "n": [5, 9], "na": [], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": [], "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13, 16], "neg": 8, "nest": 16, "nestedobject": [], "netraj": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "newer": [], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": 5, "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": [1, 7], "offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "opinion": 1, 
"optic": [4, 16], "optim": 4, "option": [5, 7, 11], "order": [2, 5, 6, 8], "org": [1, 5, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": 16, "paragraph_break": 16, "parallel": [], "param": [8, 16], "paramet": [4, 6, 7, 15], "pars": [4, 5], "parseq": [4, 7, 13, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": [7, 9], "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "patil": [], "pattern": 1, "pdf": [6, 7, 10], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 7, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": 16, "pixbuf": 3, "pixel": [6, 8, 16], "platinum": [], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, "polish": 5, "polit": 1, "polygon": [5, 9, 16], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 13, 16], "post": [1, 16], "postprocessor": 16, "potenti": 7, "power": 4, "ppageno": 16, "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9, 16], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, "preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "properti": [], "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pypdfium2": [3, 6], "pyplot": [6, 9], "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": 13, "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": [], "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "realli": [], "reason": [1, 4, 5], "rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, "recogn": 16, "recognit": [5, 9, 11], 
"recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "recurr": [], "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": 16, "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9, 16], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "relu": [], "remov": 1, "render": [6, 16], "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], "repres": [1, 9, 15, 16], "represent": [4, 7], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": 16, "resolve_lin": 16, "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "return": 16, "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "roboflow": [], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sanjin": [], "sar": [4, 7], "sar_resnet31": [7, 16], "sar_vgg16_bn": [], "satur": 8, "save": [7, 14], "saved_model": [], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seemlessli": [], "seen": 16, "segment": [4, 7, 16], "self": 16, "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": [8, 16], "seri": 1, "serial": [], "serialized_model": [], "seriou": 1, "set": [1, 3, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [4, 6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": [1, 16], "span": 16, "spanish": 5, "spatial": [4, 5, 6, 9], "special": [], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": [], "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": 7, "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": 3, "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": 5, "symbol": [], "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], 
"target_s": 5, "task": [4, 5, 7, 13, 14, 16], "task2": 5, "team": 3, "techminde": 3, "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": [5, 14], "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": 16, "textmatch": 9, "textnet": 7, "textnet_bas": 7, "textnet_smal": 7, "textnet_tini": 7, "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "tf_model": [], "tflite": [], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": 16, "through": [1, 8, 14], "tilman": 13, "time": [1, 4, 7, 9, 14], "tini": 7, "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "turn": [], "two": [6, 12], "txt": 5, "type": [6, 9, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": [14, 16], "underneath": 6, "understand": [4, 5, 16], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unit": [], "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v0": [], "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "veri": 7, "verifi": [], "verma": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn_r": 7, "via": 1, "video": [], "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warm": [], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": 6, "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14, 16], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [4, 5, 14], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x12larg": [], "x_ascend": 16, "x_descend": 16, 
"x_i": 9, "x_size": 16, "x_wconf": 16, "xeon": [], "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "yugesh": [], "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 5, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": 5, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 5, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 5, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 5, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": 0, "9": [], "advanc": 16, "approach": 16, "architectur": 16, "arg": [5, 6, 7, 8, 9], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "compress": [], "conda": 3, "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 
5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "from": 13, "gener": [5, 14], "get": [], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "implement": [], "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "linux": 3, "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "note": [], "notebook": 10, "object": 14, "ocr": 16, "onli": 3, "onnx": 15, "optim": 15, "option": 16, "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": [], "pre": [], "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "process": [], "push": 13, "python": 3, "qualiti": 2, "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "refer": [], "report": 2, "request": 2, "resourc": [], "respons": 1, "return": [5, 6, 7, 9], "right": 16, "savedmodel": [], "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "start": [], "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. 
Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Advanced options": [[16, "advanced-options"]], "Args:": [[5, "args"], [5, "id4"], [5, "id7"], [5, "id10"], [5, "id13"], [5, "id16"], [5, "id19"], [5, "id22"], [5, "id25"], [5, "id29"], [5, "id32"], [5, "id37"], [5, "id40"], [5, "id46"], [5, "id49"], [5, "id50"], [5, "id51"], [5, "id54"], [5, "id57"], [5, "id60"], [5, "id61"], [6, "args"], [6, "id2"], [6, "id3"], [6, "id4"], [6, "id5"], [6, "id6"], [6, "id7"], [6, "id10"], [6, "id12"], [6, "id14"], [6, "id16"], [6, "id20"], [6, "id24"], [6, "id28"], [7, "args"], [7, "id3"], [7, "id8"], [7, "id13"], [7, "id17"], [7, "id21"], [7, "id26"], [7, "id31"], [7, "id36"], [7, "id41"], [7, "id45"], [7, "id49"], [7, "id54"], [7, "id58"], [7, "id63"], [7, "id68"], [7, "id72"], [7, "id76"], [7, "id81"], [7, "id86"], [7, "id90"], [7, "id95"], [7, "id100"], [7, "id105"], [7, "id110"], [7, "id114"], [7, "id118"], [7, "id123"], [7, "id128"], [7, "id133"], [7, "id137"], [7, "id141"], [7, "id146"], [7, "id150"], [7, "id154"], [7, "id158"], [7, "id160"], [7, "id162"], [7, "id164"], [8, "args"], [8, "id1"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id8"], [8, "id9"], [8, "id10"], [8, "id11"], [8, "id12"], [8, "id13"], [8, "id14"], [8, "id15"], [8, "id16"], [8, "id17"], [8, "id18"], [9, "args"], [9, "id3"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface Hub": [[13, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, 
"modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Returns:": [[5, "returns"], [6, "returns"], [6, "id11"], [6, "id13"], [6, "id15"], [6, "id19"], [6, "id23"], [6, "id27"], [6, "id31"], [7, "returns"], [7, "id6"], [7, "id11"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id29"], [7, "id34"], [7, "id39"], [7, "id44"], [7, "id48"], [7, "id52"], [7, "id57"], [7, "id61"], [7, "id66"], [7, "id71"], [7, "id75"], [7, "id79"], [7, "id84"], [7, "id89"], [7, "id93"], [7, "id98"], [7, "id103"], [7, "id108"], [7, "id113"], [7, "id117"], [7, "id121"], [7, "id126"], [7, "id131"], [7, "id136"], [7, "id140"], [7, "id144"], [7, "id149"], [7, "id153"], [7, "id157"], [7, "id159"], [7, "id161"], [7, "id163"], [9, "returns"], [9, "id4"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, "use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Conda (Only for Linux)": [[3, "via-conda-only-for-linux"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], 
"v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, 
"doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, 
"doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in 
doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[5, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WILDRECEIPT"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, 
"", "textnet_base"], [7, 1, 1, "", "textnet_small"], [7, 1, 1, "", "textnet_tiny"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "fast_base"], [7, 1, 1, "", "fast_small"], [7, 1, 1, "", "fast_tiny"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": [], "02562": 7, "03": 16, "035": 16, "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 9, 11, 16], "104": 5, "106": 5, "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": 5, "115": [], "1156": 14, "116": 5, "118": 5, "11800h": 16, "11th": 16, "12": [3, 16], "120": 5, "123": 5, "126": 5, "1268": 14, "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, "1396484375": 16, "14": 16, "1420": 16, "14470v1": 5, "149": 14, "15": 16, "150": [9, 16], "1552": 16, "16": [7, 15, 16], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": 7, "185546875": 16, "1900": 16, "1910": 7, "19342": 14, "19370": 14, 
"195": 5, "19598": 14, "199": 16, "1999": 16, "2": [3, 4, 5, 6, 8, 16], "20": 16, "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2023": [], "207901": 14, "21": 16, "2103": 5, "2186": 14, "21888": 14, "22": 16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": 16, "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": 7, "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": [8, 16], "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": [], "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": 16, "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": 14, "48": [5, 16], "485": 8, "49": 16, "49377": 14, "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "597": 16, "5k": [4, 5], "5m": 16, "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "62": 16, "626": 14, "63": 16, "64": [7, 8, 16], "641": 16, "647": 14, "65": 16, "66": 16, "67": 16, "68": 16, "69": 16, "693": 11, "694": 11, "695": 11, "6m": 16, "7": 16, "70": [9, 16], "707470": 14, "71": 16, "7100000": 14, "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "74": 16, "75": [8, 16], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "785": 11, "79": 16, "793533": 14, "796": 14, "798": 11, "7m": 16, "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "82": 16, "83": 16, "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "8603515625": 16, "87": 16, "8707": 14, "88": 16, "89": 16, "9": 16, "90": 16, "90k": 5, "90kdict32px": 5, "91": 16, "914085328578949": 16, "92": 16, "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": [], "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "As": 2, "Be": 16, "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 16], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": 7, "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": 16, "_build": 2, "_i": 9, "ab": 5, "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, "abstract": [], "abstractdataset": 5, "abus": 1, "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": 16, "accuraci": 9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13, 16], "add_hook": 16, "add_label": 9, "addit": [2, 3, 6], "addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": 7, "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, "along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "andrej": [], "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 
9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": [4, 7], "arch": [7, 13], "architectur": [4, 7, 13], "area": 16, "arg": [], "argument": [5, 6, 7, 9, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "articl": [], "artifici": [4, 5], "arxiv": [5, 7], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autom": 4, "automat": 16, "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": 16, "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "baranovskij": [], "base": [4, 7], "baselin": [4, 7, 16], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": 3, "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": [1, 16], "being": [9, 16], "belong": 16, "benchmark": 16, "best": 1, "better": [10, 16], "between": [8, 9, 16], "bgr": 6, "bilinear": 8, "bin_thresh": 16, "binar": [4, 7, 16], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": 16, "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [3, 6, 9], "c_j": 9, "cach": [2, 5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], "capabl": [2, 10, 16], "case": [5, 9], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "channel": [1, 2, 6, 8], "channel_prior": 3, "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": [5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, "check": [2, 13, 16], "checkpoint": 7, "chip": 3, "christian": [], "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 7, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 5, "compil": [10, 16], "complaint": 1, "complementari": 9, "complet": 2, "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], "config": [3, 7], "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, 
"contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "convers": 6, "convert": [6, 8], "convolut": 7, "cool": [], "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": [13, 16], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": 16, "cvit": 4, "czczup": 7, "czech": 5, "d": [5, 14], "danish": 5, "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": 16, "default": [3, 6, 9, 11, 12, 16], "defer": 14, "defin": [9, 15], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": [11, 16], "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, "detectionpredictor": [7, 11], "detector": [4, 7], "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12, 16], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, "discuss": 2, "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distribut": 8, "div": 16, "divers": 1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": 16, "documentfil": [6, 13], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [4, 5], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": 5, "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easili": [6, 9, 11, 13, 14, 16], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enough": [2, 16], "ensur": 2, "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exampl": [1, 2, 4, 5, 7, 13, 16], "exchang": 15, "execut": 16, "exist": 13, 
"expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": 13, "fast": [4, 5, 7], "fast_bas": [7, 16], "fast_smal": [7, 16], "fast_tini": [7, 16], "faster": [4, 7, 15], "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15, 16], "figsiz": 9, "figur": 9, "file": [2, 5], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": 5, "first": [2, 5], "firsthand": 5, "fit": [7, 16], "flag": 16, "flip": 8, "float": [6, 8, 9, 15], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": 3, "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "found": [], "fp16": 15, "frac": 9, "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gallagh": [], "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11, 13], "get": 16, "git": 13, "github": [2, 3, 7, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, "googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": [4, 5, 6], "grayscal": 8, "ground": 9, "groung": 9, "group": [4, 16], "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, "gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "handl": [14, 16], "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": 5, "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5, 16], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": 16, "horizont": [6, 8], "hous": 5, "how": [2, 11, 13, 14], "howev": 14, "hsv": 8, "html": [1, 2, 6, 16], "http": [1, 3, 5, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ibrahimov": [], "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 14], "img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 7, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": 7, "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inform": [1, 2, 4, 5, 14], "input": [2, 6, 7, 8, 15, 16], 
"input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "issu": [1, 2, 13], "italian": 5, "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "jame": [], "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "kei": [4, 5], "kera": [7, 15], "kernel": [4, 7, 8], "kernel_shap": 8, "keywoard": 7, "keyword": [5, 6, 7, 9], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": 1, "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], "legacy_french": 5, "length": [5, 16], "less": [15, 16], "level": [1, 5, 9, 16], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 7, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet_resnet18": [7, 11, 16], "linknet_resnet18_rot": [], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": [], "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loc_pr": 16, "local": [2, 4, 5, 7, 9, 14, 16], "localis": 5, "localizationconfus": 9, "locat": [2, 6, 16], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "love": 13, "lower": [8, 9, 16], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": [14, 16], "manipul": 16, "map": [5, 7], "map_loc": 11, "mask_shap": 9, "master": [4, 7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": [6, 9], "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": 7, "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [6, 8, 16], "metric": [9, 16], "middl": 16, "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": [4, 7], "minimum": [3, 5, 8, 9, 16], "minval": 8, "miss": 3, "mistak": 1, "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, 
"modal": [4, 5], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12, 16], "modul": [6, 7, 8, 9, 16], "moment": [], "more": [2, 9, 14, 16], "moscardi": [], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [5, 13], "multipl": [5, 6, 8, 16], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": 16, "n": [5, 9], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": [], "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13, 16], "neg": 8, "nest": 16, "netraj": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": 5, "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": [1, 7], "offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "opinion": 1, "optic": [4, 16], "optim": 4, "option": [5, 7, 11], "order": [2, 5, 6, 8], "org": [1, 5, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": 16, "paragraph_break": 16, "parallel": [], "param": [8, 16], "paramet": [4, 6, 7, 15], "pars": [4, 5], "parseq": [4, 7, 13, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": [7, 9], "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "patil": [], "pattern": 1, "pdf": [6, 7, 10], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 7, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": 16, "pixbuf": 3, "pixel": [6, 8, 16], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, "polish": 5, "polit": 1, "polygon": [5, 9, 16], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 13, 16], "post": [1, 16], "postprocessor": 16, "potenti": 7, "power": 4, "ppageno": 16, "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9, 16], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, 
"preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pypdfium2": [3, 6], "pyplot": [6, 9], "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": 13, "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": [], "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "realli": [], "reason": [1, 4, 5], "rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, "recogn": 16, "recognit": [5, 9, 11], "recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": 16, "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9, 16], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "remov": 1, "render": [6, 16], "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], "repres": [1, 9, 15, 16], "represent": [4, 7], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": 16, "resolve_lin": 16, "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "return": 16, "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "roboflow": [], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sanjin": [], "sar": [4, 7], "sar_resnet31": [7, 16], "satur": 8, "save": [7, 14], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "score": 9, "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seen": 16, "segment": [4, 7, 16], "self": 16, "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": [8, 16], "seri": 1, "seriou": 1, "set": [1, 3, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "shade": 8, "shape": [4, 6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, 
"sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": [1, 16], "span": 16, "spanish": 5, "spatial": [4, 5, 6, 9], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": [], "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": 7, "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": 3, "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": 5, "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "task": [4, 5, 7, 13, 14, 16], "task2": 5, "team": 3, "techminde": 3, "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": [5, 14], "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": 16, "textmatch": 9, "textnet": 7, "textnet_bas": 7, "textnet_smal": 7, "textnet_tini": 7, "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": 16, "through": [1, 8, 14], "tilman": 13, "time": [1, 4, 7, 9, 14], "tini": 7, "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "two": [6, 12], "txt": 5, "type": [6, 9, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": [14, 16], "underneath": 6, "understand": [4, 5, 16], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unit": [], "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "veri": 7, "verma": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn_r": 7, 
"via": 1, "video": [], "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": 6, "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14, 16], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [4, 5, 14], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "yugesh": [], "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 5, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, 
"\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": 5, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 5, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 5, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 5, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], 
"\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": 0, "9": [], "advanc": 16, "approach": 16, "architectur": 16, "arg": [5, 6, 7, 8, 9], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "ban": 1, "block": 6, "bug": 2, "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "conda": 3, "conduct": 1, "connect": 2, "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "from": 13, "gener": [5, 14], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "linux": 3, "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "notebook": 10, "object": 14, "ocr": 16, "onli": 3, "onnx": 15, "optim": 15, "option": 16, "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "push": 13, "python": 3, "qualiti": 2, "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "report": 2, "request": 2, "resourc": [], "respons": 1, "return": [5, 6, 7, 9], "right": 16, "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file diff --git a/v0.8.1/transforms.html b/v0.8.1/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.8.1/transforms.html +++ 
/dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.
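For instance, here is a minimal sketch of chaining two of the modules documented below, assuming a TensorFlow backend as in the examples on this page (the normalization statistics are taken from the Normalize example further down and are illustrative only):
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, Normalize
>>> pipeline = Compose([Resize((32, 32)), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
>>> out = pipeline(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))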

-
-

Supported transformations

-

Here are all transformations that are available through DocTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically

  • -
-
-
-
- -
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor to a Gaussian distribution for each channel

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a user-defined function to the input tensor (image or batch of images)

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (image or batch of images) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): -convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust brightness of a tensor (batch of images or image) by adding a delta -to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust contrast of a tensor (batch of images or image) by adjusting -each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and -increasing saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – offset to add to each pixel is randomly picked in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (batch of images or image)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
-class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]
-

Randomly adjust the JPEG quality of a 3-dimensional RGB image

-
-
Example::
>>> from doctr.transforms import RandomJpegQuality
->>> import tensorflow as tf
->>> transfo = RandomJpegQuality()
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • min_quality – int between [0, 100]

  • -
  • max_quality – int between [0, 100]

  • -
-
-
-
- -
-
-

Composing transformations

-

It is common to require several transformations to be performed consecutively.

-
-
-class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]
-

Implements a wrapper that will apply transformations sequentially

-
-
Example::
>>> from doctr.transforms import Compose, Resize
->>> import tensorflow as tf
->>> transfos = Compose([Resize((32, 32))])
->>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformation modules

-
-
-
- -
-
-class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]
-

Randomly apply one of the input transformations

-
-
Example::
>>> from doctr.transforms import OneOf, RandomJpegQuality, RandomGamma
->>> import tensorflow as tf
->>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

transforms – list of transformations; only one of them will be picked

-
-
-
- -
-
-class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]
-

Apply the input transformation with probability p

-
-
Example::
>>> from doctr.transforms import RandomApply, RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomApply(RandomGamma(), p=.5)
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • transform – transformation to apply

  • -
  • p – probability to apply

  • -
-
-
-
- -
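Put together, the wrappers above can form a small augmentation pipeline. A minimal sketch, assuming a TensorFlow backend as in the examples above (the parameter values are illustrative only):
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, RandomApply, OneOf, RandomBrightness, RandomContrast, RandomGamma
>>> augment = Compose([
...     RandomApply(RandomBrightness(max_delta=0.3), p=0.5),
...     OneOf([RandomContrast(delta=0.3), RandomGamma()]),
... ])
>>> out = augment(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))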
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.8.1/using_doctr/custom_models_training.html b/v0.8.1/using_doctr/custom_models_training.html index e7bfa08b73..238645ae9e 100644 --- a/v0.8.1/using_doctr/custom_models_training.html +++ b/v0.8.1/using_doctr/custom_models_training.html @@ -14,7 +14,7 @@ - + Train your own model - docTR documentation @@ -545,7 +545,7 @@

Loading your custom trained model - + diff --git a/v0.8.1/using_doctr/running_on_aws.html b/v0.8.1/using_doctr/running_on_aws.html index 012b68298e..a6a1e4e5ba 100644 --- a/v0.8.1/using_doctr/running_on_aws.html +++ b/v0.8.1/using_doctr/running_on_aws.html @@ -14,7 +14,7 @@ - + AWS Lambda - docTR documentation @@ -356,7 +356,7 @@

AWS Lambda - + diff --git a/v0.8.1/using_doctr/sharing_models.html b/v0.8.1/using_doctr/sharing_models.html index 2df93350f0..24408f1601 100644 --- a/v0.8.1/using_doctr/sharing_models.html +++ b/v0.8.1/using_doctr/sharing_models.html @@ -14,7 +14,7 @@ - + Share your model with the community - docTR documentation @@ -538,7 +538,7 @@

Recognition - + diff --git a/v0.8.1/using_doctr/using_contrib_modules.html b/v0.8.1/using_doctr/using_contrib_modules.html index 50598dae5d..0c5fffdf9f 100644 --- a/v0.8.1/using_doctr/using_contrib_modules.html +++ b/v0.8.1/using_doctr/using_contrib_modules.html @@ -14,7 +14,7 @@ - + Integrate contributions into your pipeline - docTR documentation @@ -415,7 +415,7 @@

ArtefactDetection - + diff --git a/v0.8.1/using_doctr/using_datasets.html b/v0.8.1/using_doctr/using_datasets.html index 640244db19..dfeb445a8f 100644 --- a/v0.8.1/using_doctr/using_datasets.html +++ b/v0.8.1/using_doctr/using_datasets.html @@ -14,7 +14,7 @@ - + Choose a ready to use dataset - docTR documentation @@ -623,7 +623,7 @@

Data Loading - + diff --git a/v0.8.1/using_doctr/using_model_export.html b/v0.8.1/using_doctr/using_model_export.html index 46cccf92cd..ff37d657a6 100644 --- a/v0.8.1/using_doctr/using_model_export.html +++ b/v0.8.1/using_doctr/using_model_export.html @@ -14,7 +14,7 @@ - + Preparing your model for inference - docTR documentation @@ -436,7 +436,7 @@

Using your ONNX exported model in docTR - + diff --git a/v0.8.1/using_doctr/using_models.html b/v0.8.1/using_doctr/using_models.html index aa71c2af71..d2e0000fa9 100644 --- a/v0.8.1/using_doctr/using_models.html +++ b/v0.8.1/using_doctr/using_models.html @@ -14,7 +14,7 @@ - + Choosing the right model - docTR documentation @@ -1198,7 +1198,7 @@

Advanced options - + diff --git a/v0.8.1/utils.html b/v0.8.1/utils.html deleted file mode 100644 index e2f223f06a..0000000000 --- a/v0.8.1/utils.html +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - doctr.utils - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
- -
- -
- -
-
-
-

doctr.utils

-

This module regroups non-core features that are complementary to the rest of the package.

-
-

Visualization

-

Easy-to-use functions to make sense of your model’s predictions.

-
-
-doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) Figure[source]
-

Visualize a full page with predicted blocks, lines and words

-
-
Example::
>>> import numpy as np
->>> import matplotlib.pyplot as plt
->>> from doctr.utils.visualization import visualize_page
->>> from doctr.models import ocr_db_crnn
->>> model = ocr_db_crnn(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([[input_page]])
->>> visualize_page(out[0].pages[0].export(), input_page)
->>> plt.show()
-
-
-
-
-
-
Parameters:
-
    -
  • page – the exported Page of a Document

  • -
  • image – np array of the page; it needs to have the same shape as page[‘dimensions’]

  • -
  • words_only – whether only words should be displayed

  • -
  • display_artefacts – whether artefacts should be displayed

  • -
  • scale – figsize of the largest window side

  • -
  • interactive – whether the plot should be interactive

  • -
  • add_labels – for static plot, adds text labels on top of bounding box

  • -
-
-
-
- -
-
-

Task evaluation

-

Implementations of task-specific metrics to easily assess your model's performance.

-
-
-class doctr.utils.metrics.TextMatch[source]
-

Implements text match metric (word-level accuracy) for recognition task.

-

The raw aggregated metric is computed as follows:

-
-
-\[\forall X, Y \in \mathcal{W}^N, -TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]
-
-

with the indicator function \(f_{a}\) defined as:

-
-
-\[\begin{split}\forall a, x \in \mathcal{W}, -f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{W}\) is the set of all possible character sequences, -\(N\) is a strictly positive integer.

-
-
Example::
>>> from doctr.utils import TextMatch
->>> metric = TextMatch()
->>> metric.update(['Hello', 'world'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
-summary() Dict[str, float][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode -counterpart and its lower-case unidecode counterpart

-
-
-
- -
- -
-
-class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements common confusion metrics and mean IoU for localization evaluation.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ -Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(g_{X}\) defined as:

-
-
-\[\begin{split}\forall y \in \mathcal{B}, -g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import LocalizationConfusion
->>> metric = LocalizationConfusion(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[float | None, float | None, float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall, precision and meanIoU scores

-
-
-
- -
- -
-
-class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
-

Implements end-to-end OCR metric.

-

The aggregated metrics are computed as follows:

-
-
-\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, -\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ -Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
-
-

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, L}\) defined as:

-
-
-\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L}, -h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
-
-

where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{L}\) is the set of possible character sequences, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

-
-
Example::
>>> import numpy as np
->>> from doctr.utils import OCRMetric
->>> metric = OCRMetric(iou_thresh=0.5)
->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-['hello'], ['hello', 'world'])
->>> metric.summary()
-
-
-
-
-
-
Parameters:
-

iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

-
-
-
-
-summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
-

Computes the aggregated metrics

-
-
Returns:
-

a tuple with the recall & precision for each string comparison flexibility and the mean IoU

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/datasets/cord.html b/v0.9.0/_modules/doctr/datasets/cord.html index 354f0062c2..85f1a47a08 100644 --- a/v0.9.0/_modules/doctr/datasets/cord.html +++ b/v0.9.0/_modules/doctr/datasets/cord.html @@ -13,7 +13,7 @@ - + doctr.datasets.cord - docTR documentation @@ -447,7 +447,7 @@

Source code for doctr.datasets.cord

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/core.html b/v0.9.0/_modules/doctr/datasets/core.html deleted file mode 100644 index b3dcc29ff9..0000000000 --- a/v0.9.0/_modules/doctr/datasets/core.html +++ /dev/null @@ -1,417 +0,0 @@ - - - - - - - - - - - - doctr.datasets.core - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.datasets.core

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from pathlib import Path
-from zipfile import ZipFile
-from typing import List, Any, Optional, Tuple
-import tensorflow as tf
-
-from doctr.models.utils import download_from_url
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset:
-
-    data: List[Any] = []
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(
-        self,
-        index: int
-    ) -> Tuple[tf.Tensor, Any]:
-
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.sample_transforms is not None:
-            img = self.sample_transforms(img)
-
-        return img, target
-
-    def extra_repr(self) -> str:
-        return ""
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset): - """Implements an abstract dataset - - Args: - url: URL of the dataset - file_name: name of the file once downloaded - file_hash: expected SHA256 of the file - extract_archive: whether the downloaded file is an archive to be extracted - download: whether the dataset should be downloaded if not present on disk - overwrite: whether the archive should be re-extracted - """ - - def __init__( - self, - url: str, - file_name: Optional[str] = None, - file_hash: Optional[str] = None, - extract_archive: bool = False, - download: bool = False, - overwrite: bool = False, - ) -> None: - - dataset_cache = os.path.join(os.path.expanduser('~'), '.cache', 'doctr', 'datasets') - - file_name = file_name if isinstance(file_name, str) else os.path.basename(url) - # Download the file if not present - archive_path = os.path.join(dataset_cache, file_name) - - if not os.path.exists(archive_path) and not download: - raise ValueError("the dataset needs to be downloaded first with download=True") - - archive_path = download_from_url(url, file_name, file_hash, cache_subdir='datasets') - - # Extract the archive - if extract_archive: - archive_path = Path(archive_path) - dataset_path = archive_path.parent.joinpath(archive_path.stem) - if not dataset_path.is_dir() or overwrite: - with ZipFile(archive_path, 'r') as f: - f.extractall(path=dataset_path) - - # List images - self._root = dataset_path if extract_archive else archive_path - self.data: List[Any] = []
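A minimal sketch of how a concrete dataset might build on this class; the URL, file name and sample entry below are placeholders for illustration, not a real dataset:
>>> from doctr.datasets.core import VisionDataset
>>> class MyWords(VisionDataset):
...     def __init__(self, download: bool = True) -> None:
...         super().__init__(
...             url="https://example.com/my_words.zip",  # hypothetical archive URL
...             file_name="my_words.zip",
...             file_hash=None,  # skip SHA256 verification in this sketch
...             extract_archive=True,
...             download=download,
...         )
...         # pairs of (image path relative to the extracted root, target)
...         self.data = [("images/sample_0001.jpg", "hello")]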
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.9.0/_modules/doctr/datasets/datasets/tensorflow.html deleted file mode 100644 index a236abd9fe..0000000000 --- a/v0.9.0/_modules/doctr/datasets/datasets/tensorflow.html +++ /dev/null @@ -1,356 +0,0 @@ - - - - - - - - - - - - doctr.datasets.datasets.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.datasets.datasets.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import os
-from typing import List, Any, Tuple
-import tensorflow as tf
-
-from .base import _AbstractDataset, _VisionDataset
-
-
-__all__ = ['AbstractDataset', 'VisionDataset']
-
-
-class AbstractDataset(_AbstractDataset):
-
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-
-        return img, target
-
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, list(targets)
-
-
-
-[docs] -class VisionDataset(AbstractDataset, _VisionDataset): - pass
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/datasets/detection.html b/v0.9.0/_modules/doctr/datasets/detection.html index faf9256c89..706b89a562 100644 --- a/v0.9.0/_modules/doctr/datasets/detection.html +++ b/v0.9.0/_modules/doctr/datasets/detection.html @@ -13,7 +13,7 @@ - + doctr.datasets.detection - docTR documentation @@ -424,7 +424,7 @@

Source code for doctr.datasets.detection

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/doc_artefacts.html b/v0.9.0/_modules/doctr/datasets/doc_artefacts.html index 886999868b..dc8e8f9c29 100644 --- a/v0.9.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.9.0/_modules/doctr/datasets/doc_artefacts.html @@ -13,7 +13,7 @@ - + doctr.datasets.doc_artefacts - docTR documentation @@ -408,7 +408,7 @@

Source code for doctr.datasets.doc_artefacts

   
- + diff --git a/v0.9.0/_modules/doctr/datasets/funsd.html b/v0.9.0/_modules/doctr/datasets/funsd.html index 60f7e51592..6f7ab121f0 100644 --- a/v0.9.0/_modules/doctr/datasets/funsd.html +++ b/v0.9.0/_modules/doctr/datasets/funsd.html @@ -13,7 +13,7 @@ - + doctr.datasets.funsd - docTR documentation @@ -438,7 +438,7 @@

Source code for doctr.datasets.funsd

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.9.0/_modules/doctr/datasets/generator/tensorflow.html index fecf8b2d82..814dc0822d 100644 --- a/v0.9.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.9.0/_modules/doctr/datasets/generator/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.datasets.generator.tensorflow - docTR documentation @@ -389,7 +389,7 @@

Source code for doctr.datasets.generator.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/datasets/ic03.html b/v0.9.0/_modules/doctr/datasets/ic03.html index 83f7bcddf0..cf8999d751 100644 --- a/v0.9.0/_modules/doctr/datasets/ic03.html +++ b/v0.9.0/_modules/doctr/datasets/ic03.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic03 - docTR documentation @@ -452,7 +452,7 @@

Source code for doctr.datasets.ic03

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/ic13.html b/v0.9.0/_modules/doctr/datasets/ic13.html index 1d92d10349..7650af381c 100644 --- a/v0.9.0/_modules/doctr/datasets/ic13.html +++ b/v0.9.0/_modules/doctr/datasets/ic13.html @@ -13,7 +13,7 @@ - + doctr.datasets.ic13 - docTR documentation @@ -425,7 +425,7 @@

Source code for doctr.datasets.ic13

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/iiit5k.html b/v0.9.0/_modules/doctr/datasets/iiit5k.html index 14ab1db716..b4a54e7e22 100644 --- a/v0.9.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.9.0/_modules/doctr/datasets/iiit5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiit5k - docTR documentation @@ -429,7 +429,7 @@

Source code for doctr.datasets.iiit5k

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/iiithws.html b/v0.9.0/_modules/doctr/datasets/iiithws.html index e7c0d4e8dd..052a85cd56 100644 --- a/v0.9.0/_modules/doctr/datasets/iiithws.html +++ b/v0.9.0/_modules/doctr/datasets/iiithws.html @@ -13,7 +13,7 @@ - + doctr.datasets.iiithws - docTR documentation @@ -401,7 +401,7 @@

Source code for doctr.datasets.iiithws

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/imgur5k.html b/v0.9.0/_modules/doctr/datasets/imgur5k.html index 202d29445f..24314d9dd1 100644 --- a/v0.9.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.9.0/_modules/doctr/datasets/imgur5k.html @@ -13,7 +13,7 @@ - + doctr.datasets.imgur5k - docTR documentation @@ -475,7 +475,7 @@

Source code for doctr.datasets.imgur5k

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/loader.html b/v0.9.0/_modules/doctr/datasets/loader.html index ddcafcea67..f78b43db97 100644 --- a/v0.9.0/_modules/doctr/datasets/loader.html +++ b/v0.9.0/_modules/doctr/datasets/loader.html @@ -13,7 +13,7 @@ - + doctr.datasets.loader - docTR documentation @@ -425,7 +425,7 @@

Source code for doctr.datasets.loader

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/mjsynth.html b/v0.9.0/_modules/doctr/datasets/mjsynth.html index d7a7e66e35..c95f99e6d5 100644 --- a/v0.9.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.9.0/_modules/doctr/datasets/mjsynth.html @@ -13,7 +13,7 @@ - + doctr.datasets.mjsynth - docTR documentation @@ -432,7 +432,7 @@

Source code for doctr.datasets.mjsynth

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/ocr.html b/v0.9.0/_modules/doctr/datasets/ocr.html index c6e09faee3..a1a249b259 100644 --- a/v0.9.0/_modules/doctr/datasets/ocr.html +++ b/v0.9.0/_modules/doctr/datasets/ocr.html @@ -13,7 +13,7 @@ - + doctr.datasets.ocr - docTR documentation @@ -397,7 +397,7 @@

Source code for doctr.datasets.ocr

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/recognition.html b/v0.9.0/_modules/doctr/datasets/recognition.html index 1e14da06a9..95612cdadb 100644 --- a/v0.9.0/_modules/doctr/datasets/recognition.html +++ b/v0.9.0/_modules/doctr/datasets/recognition.html @@ -13,7 +13,7 @@ - + doctr.datasets.recognition - docTR documentation @@ -382,7 +382,7 @@

Source code for doctr.datasets.recognition

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/sroie.html b/v0.9.0/_modules/doctr/datasets/sroie.html index f3ac7b9547..32b4b17983 100644 --- a/v0.9.0/_modules/doctr/datasets/sroie.html +++ b/v0.9.0/_modules/doctr/datasets/sroie.html @@ -13,7 +13,7 @@ - + doctr.datasets.sroie - docTR documentation @@ -429,7 +429,7 @@

Source code for doctr.datasets.sroie

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/svhn.html b/v0.9.0/_modules/doctr/datasets/svhn.html index f10a8cfd8e..5633dcfd6c 100644 --- a/v0.9.0/_modules/doctr/datasets/svhn.html +++ b/v0.9.0/_modules/doctr/datasets/svhn.html @@ -13,7 +13,7 @@ - + doctr.datasets.svhn - docTR documentation @@ -457,7 +457,7 @@

Source code for doctr.datasets.svhn

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/svt.html b/v0.9.0/_modules/doctr/datasets/svt.html index 0d64efedf4..0ed4482c50 100644 --- a/v0.9.0/_modules/doctr/datasets/svt.html +++ b/v0.9.0/_modules/doctr/datasets/svt.html @@ -13,7 +13,7 @@ - + doctr.datasets.svt - docTR documentation @@ -443,7 +443,7 @@

Source code for doctr.datasets.svt

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/synthtext.html b/v0.9.0/_modules/doctr/datasets/synthtext.html index 333de06da8..edd5c63c80 100644 --- a/v0.9.0/_modules/doctr/datasets/synthtext.html +++ b/v0.9.0/_modules/doctr/datasets/synthtext.html @@ -13,7 +13,7 @@ - + doctr.datasets.synthtext - docTR documentation @@ -454,7 +454,7 @@

Source code for doctr.datasets.synthtext

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/utils.html b/v0.9.0/_modules/doctr/datasets/utils.html index 18a602f09b..f4a2e6a244 100644 --- a/v0.9.0/_modules/doctr/datasets/utils.html +++ b/v0.9.0/_modules/doctr/datasets/utils.html @@ -13,7 +13,7 @@ - + doctr.datasets.utils - docTR documentation @@ -545,7 +545,7 @@

Source code for doctr.datasets.utils

     
   
- + diff --git a/v0.9.0/_modules/doctr/datasets/wildreceipt.html b/v0.9.0/_modules/doctr/datasets/wildreceipt.html index 2b386ae694..6b5a52a10e 100644 --- a/v0.9.0/_modules/doctr/datasets/wildreceipt.html +++ b/v0.9.0/_modules/doctr/datasets/wildreceipt.html @@ -13,7 +13,7 @@ - + doctr.datasets.wildreceipt - docTR documentation @@ -437,7 +437,7 @@

Source code for doctr.datasets.wildreceipt

     
   
- + diff --git a/v0.9.0/_modules/doctr/documents/elements.html b/v0.9.0/_modules/doctr/documents/elements.html deleted file mode 100644 index 10c1e142d2..0000000000 --- a/v0.9.0/_modules/doctr/documents/elements.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - - - - - - - doctr.documents.elements - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.documents.elements

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple, Dict, List, Any, Optional, Union
-
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.visualization import visualize_page
-from doctr.utils.common_types import BoundingBox, RotatedBbox
-from doctr.utils.repr import NestedObject
-
-__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
-
-
-class Element(NestedObject):
-    """Implements an abstract document element with exporting and text rendering capabilities"""
-
-    _exported_keys: List[str] = []
-
-    def __init__(self, **kwargs: Any) -> None:
-        self._children_names: List[str] = []
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-            self._children_names.append(k)
-
-    def export(self) -> Dict[str, Any]:
-        """Exports the object into a nested dict format"""
-
-        export_dict = {k: getattr(self, k) for k in self._exported_keys}
-        for children_name in self._children_names:
-            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
-
-        return export_dict
-
-    def render(self) -> str:
-        raise NotImplementedError
-
-
-
-[docs] -class Word(Element): - """Implements a word element - - Args: - value: the text string of the word - confidence: the confidence associated with the text prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size - """ - - _exported_keys: List[str] = ["value", "confidence", "geometry"] - - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: - super().__init__() - self.value = value - self.confidence = confidence - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return self.value - - def extra_repr(self) -> str: - return f"value='{self.value}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Artefact(Element): - """Implements a non-textual element - - Args: - artefact_type: the type of artefact - confidence: the confidence of the type prediction - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. - """ - - _exported_keys: List[str] = ["geometry", "type", "confidence"] - - def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None: - super().__init__() - self.geometry = geometry - self.type = artefact_type - self.confidence = confidence - - def render(self) -> str: - """Renders the full text of the element""" - return f"[{self.type.upper()}]" - - def extra_repr(self) -> str: - return f"type='{self.type}', confidence={self.confidence:.2}"
- - - -
-[docs] -class Line(Element): - """Implements a line element as a collection of words - - Args: - words: list of word elements - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all words in it. - """ - - _exported_keys: List[str] = ["geometry"] - words: List[Word] = [] - - def __init__( - self, - words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] - - super().__init__(words=words) - self.geometry = geometry - - def render(self) -> str: - """Renders the full text of the element""" - return " ".join(w.render() for w in self.words)
- - - -
-[docs] -class Block(Element): - """Implements a block element as a collection of lines and artefacts - - Args: - lines: list of line elements - artefacts: list of artefacts - geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to - the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing - all lines and artefacts in it. - """ - - _exported_keys: List[str] = ["geometry"] - lines: List[Line] = [] - artefacts: List[Artefact] = [] - - def __init__( - self, - lines: List[Line] = [], - artefacts: List[Artefact] = [], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, - ) -> None: - # Resolve the geometry using the smallest enclosing bounding box - if geometry is None: - line_boxes = [word.geometry for line in lines for word in line.words] - artefact_boxes = [artefact.geometry for artefact in artefacts] - box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator, arg-type] - - super().__init__(lines=lines, artefacts=artefacts) - self.geometry = geometry - - def render(self, line_break: str = '\n') -> str: - """Renders the full text of the element""" - return line_break.join(line.render() for line in self.lines)
- - - -
-[docs] -class Page(Element): - """Implements a page element as a collection of blocks - - Args: - blocks: list of block elements - page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) - orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction - language: a dictionary with the language value and confidence of the prediction - """ - - _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - blocks: List[Block] = [] - - def __init__( - self, - blocks: List[Block], - page_idx: int, - dimensions: Tuple[int, int], - orientation: Optional[Dict[str, Any]] = None, - language: Optional[Dict[str, Any]] = None, - ) -> None: - super().__init__(blocks=blocks) - self.page_idx = page_idx - self.dimensions = dimensions - self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) - self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) - - def render(self, block_break: str = '\n\n') -> str: - """Renders the full text of the element""" - return block_break.join(b.render() for b in self.blocks) - - def extra_repr(self) -> str: - return f"dimensions={self.dimensions}" - -
-[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: - """Overlay the result on a given image - - Args: - page: image encoded as a numpy array in uint8 - interactive: whether the display should be interactive - """ - visualize_page(self.export(), page, interactive=interactive) - plt.show(**kwargs)
-
- - - -
-[docs] -class Document(Element): - """Implements a document element as a collection of pages - - Args: - pages: list of page elements - """ - - pages: List[Page] = [] - - def __init__( - self, - pages: List[Page], - ) -> None: - super().__init__(pages=pages) - - def render(self, page_break: str = '\n\n\n\n') -> str: - """Renders the full text of the element""" - return page_break.join(p.render() for p in self.pages) - -
-[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image - - Args: - pages: list of images encoded as numpy arrays in uint8 - """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
-
- -
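A minimal sketch (not part of the original module) of how these elements nest, using only the constructors shown above:
>>> from doctr.documents.elements import Word, Line, Block, Page, Document
>>> words = [Word("Hello", 0.99, ((0.1, 0.1), (0.3, 0.15))), Word("world", 0.98, ((0.32, 0.1), (0.5, 0.15)))]
>>> line = Line(words)  # geometry is resolved from the enclosed words
>>> page = Page(blocks=[Block(lines=[line])], page_idx=0, dimensions=(595, 842))
>>> doc = Document(pages=[page])
>>> doc.render()
'Hello world'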
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/documents/reader.html b/v0.9.0/_modules/doctr/documents/reader.html deleted file mode 100644 index cdcd814b6c..0000000000 --- a/v0.9.0/_modules/doctr/documents/reader.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - doctr.documents.reader - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- - - - - Back to top - -
-
- -
- -
-
-

Source code for doctr.documents.reader

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from weasyprint import HTML
-from typing import List, Tuple, Optional, Any, Union, Sequence, Dict
-
-__all__ = ['read_pdf', 'read_img', 'read_html', 'DocumentFile', 'PDF']
-
-
-AbstractPath = Union[str, Path]
-AbstractFile = Union[AbstractPath, bytes]
-Bbox = Tuple[float, float, float, float]
-
-
-
-[docs] -def read_img( - file: AbstractFile, - output_size: Optional[Tuple[int, int]] = None, - rgb_output: bool = True, -) -> np.ndarray: - """Read an image file into numpy format - - Example:: - >>> from doctr.documents import read_img - >>> page = read_img("path/to/your/doc.jpg") - - Args: - file: the path to the image file - output_size: the expected output size of each page in format H x W - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - Returns: - the page decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)): - if not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - img = cv2.imread(str(file), cv2.IMREAD_COLOR) - elif isinstance(file, bytes): - file = np.frombuffer(file, np.uint8) - img = cv2.imdecode(file, cv2.IMREAD_COLOR) - else: - raise TypeError("unsupported object type for argument 'file'") - - # Validity check - if img is None: - raise ValueError("unable to read file.") - # Resizing - if isinstance(output_size, tuple): - img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR) - # Switch the channel order - if rgb_output: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - return img
- - - -
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: - """Read a PDF file and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. - - Returns: - the rendered image in numpy format - """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -def read_html(url: str, **kwargs: Any) -> bytes: - """Read a web page and convert it into a PDF file, returned as a bytes stream - - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") - - Args: - url: URL of the target web page - Returns: - decoded PDF file as a bytes stream - """ - - return HTML(url, **kwargs).write_pdf()
- - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
- - - -
-[docs] -class DocumentFile: - """Read a document from multiple extensions""" - -
-[docs] - @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: - """Read a PDF file - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") - - Args: - file: the path to the PDF file or a binary stream - Returns: - a PDF document - """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
- - -
-[docs] - @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: - """Interpret a web page as a PDF document - - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") - - Args: - url: the URL of the target web page - Returns: - a PDF document - """ - pdf_stream = read_html(url) - return cls.from_pdf(pdf_stream, **kwargs)
- - -
-[docs] - @classmethod - def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: - """Read an image file (or a collection of image files) and convert it into an image in numpy format - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) - - Args: - files: the path to the image file or a binary stream, or a collection of those - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - if isinstance(files, (str, Path, bytes)): - files = [files] - - return [read_img(file, **kwargs) for file in files]
-
- -
-
-
-
- - -
-
- - Made with Sphinx and @pradyunsg's - - Furo - -
-
- -
-
- -
-
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/io/elements.html b/v0.9.0/_modules/doctr/io/elements.html index 9a33fc8fba..a2f537c913 100644 --- a/v0.9.0/_modules/doctr/io/elements.html +++ b/v0.9.0/_modules/doctr/io/elements.html @@ -13,7 +13,7 @@ - + doctr.io.elements - docTR documentation @@ -996,7 +996,7 @@

Source code for doctr.io.elements

     
   
- + diff --git a/v0.9.0/_modules/doctr/io/html.html b/v0.9.0/_modules/doctr/io/html.html index 4524736555..9262f244e3 100644 --- a/v0.9.0/_modules/doctr/io/html.html +++ b/v0.9.0/_modules/doctr/io/html.html @@ -13,7 +13,7 @@ - + doctr.io.html - docTR documentation @@ -356,7 +356,7 @@

Source code for doctr.io.html

     
   
- + diff --git a/v0.9.0/_modules/doctr/io/image/base.html b/v0.9.0/_modules/doctr/io/image/base.html index 1b42de0506..54663fa868 100644 --- a/v0.9.0/_modules/doctr/io/image/base.html +++ b/v0.9.0/_modules/doctr/io/image/base.html @@ -13,7 +13,7 @@ - + doctr.io.image.base - docTR documentation @@ -382,7 +382,7 @@

Source code for doctr.io.image.base

     
   
- + diff --git a/v0.9.0/_modules/doctr/io/image/tensorflow.html b/v0.9.0/_modules/doctr/io/image/tensorflow.html index e428325472..d1b1c41b7f 100644 --- a/v0.9.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.9.0/_modules/doctr/io/image/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.io.image.tensorflow - docTR documentation @@ -441,7 +441,7 @@

Source code for doctr.io.image.tensorflow

     
   
- + diff --git a/v0.9.0/_modules/doctr/io/pdf.html b/v0.9.0/_modules/doctr/io/pdf.html index cb64f8eb89..1c88d93ff4 100644 --- a/v0.9.0/_modules/doctr/io/pdf.html +++ b/v0.9.0/_modules/doctr/io/pdf.html @@ -13,7 +13,7 @@ - + doctr.io.pdf - docTR documentation @@ -373,7 +373,7 @@

Source code for doctr.io.pdf

     
   
- + diff --git a/v0.9.0/_modules/doctr/io/reader.html b/v0.9.0/_modules/doctr/io/reader.html index 0a80b2867f..957d19e452 100644 --- a/v0.9.0/_modules/doctr/io/reader.html +++ b/v0.9.0/_modules/doctr/io/reader.html @@ -13,7 +13,7 @@ - + doctr.io.reader - docTR documentation @@ -422,7 +422,7 @@

Source code for doctr.io.reader

     
   
- + diff --git a/v0.9.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.9.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html index 1b97d83911..4dd332b464 100644 --- a/v0.9.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.magc_resnet.tensorflow - docTR documentation @@ -518,7 +518,7 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.9.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index 6ee5272878..4504bdd58f 100644 --- a/v0.9.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.mobilenet.tensorflow - docTR documentation @@ -783,7 +783,7 @@

Source code for doctr.models.classification.mobilenet.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.9.0/_modules/doctr/models/classification/resnet/tensorflow.html index 67c7ede371..77a5747d8b 100644 --- a/v0.9.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.resnet.tensorflow - docTR documentation @@ -730,7 +730,7 @@

Source code for doctr.models.classification.resnet.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.9.0/_modules/doctr/models/classification/textnet/tensorflow.html index ef5264039a..c17b2f02e2 100644 --- a/v0.9.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.textnet.tensorflow - docTR documentation @@ -601,7 +601,7 @@

Source code for doctr.models.classification.textnet.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.9.0/_modules/doctr/models/classification/vgg/tensorflow.html index 57e34af78f..8dc381674b 100644 --- a/v0.9.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vgg.tensorflow - docTR documentation @@ -439,7 +439,7 @@

Source code for doctr.models.classification.vgg.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.9.0/_modules/doctr/models/classification/vit/tensorflow.html index 717a6d1649..84d68b5388 100644 --- a/v0.9.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.classification.vit.tensorflow - docTR documentation @@ -521,7 +521,7 @@

Source code for doctr.models.classification.vit.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/classification/zoo.html b/v0.9.0/_modules/doctr/models/classification/zoo.html index a1b54f64bb..36a1f0fb84 100644 --- a/v0.9.0/_modules/doctr/models/classification/zoo.html +++ b/v0.9.0/_modules/doctr/models/classification/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.classification.zoo - docTR documentation @@ -429,7 +429,7 @@

Source code for doctr.models.classification.zoo

- +

diff --git a/v0.9.0/_modules/doctr/models/detection/differentiable_binarization.html b/v0.9.0/_modules/doctr/models/detection/differentiable_binarization.html deleted file mode index 38e9b36ec2..0000000000 --- a/v0.9.0/_modules/doctr/models/detection/differentiable_binarization.html +++ /dev/null @@ -1,879 +0,0 @@
- doctr.models.detection.differentiable_binarization - docTR documentation

Source code for doctr.models.detection.differentiable_binarization

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-import cv2
-from copy import deepcopy
-import numpy as np
-from shapely.geometry import Polygon
-import pyclipper
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from typing import Union, List, Tuple, Optional, Any, Dict
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
-from doctr.utils.repr import NestedObject
-
-__all__ = ['DBPostProcessor', 'DBNet', 'db_resnet50']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'db_resnet50': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'backbone': 'ResNet50',
-        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
-        'fpn_channels': 128,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'DBPostProcessor',
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
-    },
-}
-
-
-class DBPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for DBNet adapted from the implementation of `xuannianz
-    <https://github.com/xuannianz/DifferentiableBinarization>`_.
-
-    Args:
-        unclip_ratio: ratio used to expand (unclip) the shrunk polygons
-        min_size_box: minimal length (pix) to keep a box
-        max_candidates: maximum boxes to consider in a single page
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map (p_map) at inference time
-
-    """
-    def __init__(
-        self,
-        unclip_ratio: Union[float, int] = 1.5,
-        max_candidates: int = 1000,
-        box_thresh: float = 0.1,
-        bin_thresh: float = 0.3,
-    ) -> None:
-
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-        self.unclip_ratio = unclip_ratio
-        self.max_candidates = max_candidates
-
-    def polygon_to_box(
-        self,
-        points: np.ndarray,
-    ) -> Optional[Tuple[int, int, int, int]]:
-        """Expand a polygon (points) by a factor unclip_ratio and return a 4-point box
-
-        Args:
-            points: coordinates of the polygon to expand
-
-        Returns:
-            a box in absolute coordinates (x, y, w, h)
-        """
-        poly = Polygon(points)
-        distance = poly.area * self.unclip_ratio / poly.length  # compute distance to expand polygon
-        offset = pyclipper.PyclipperOffset()
-        offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        _points = offset.Execute(distance)
-        # Take biggest stack of points
-        idx = 0
-        if len(_points) > 1:
-            max_size = 0
-            for _idx, p in enumerate(_points):
-                if len(p) > max_size:
-                    idx = _idx
-                    max_size = len(p)
-            # We ensure that _points can be correctly casted to a ndarray
-            _points = [_points[idx]]
-        expanded_points = np.asarray(_points)  # expand polygon
-        if len(expanded_points) < 1:
-            return None
-        x, y, w, h = cv2.boundingRect(expanded_points)  # compute a 4-points box from expanded polygon
-        return x, y, w, h
-
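A minimal sketch of the unclip step above on a toy rectangle; the distance formula and pyclipper calls mirror the method, while the coordinates and the unclip_ratio of 1.5 are made up for illustration:

import numpy as np
import pyclipper
from shapely.geometry import Polygon

points = np.array([[10, 10], [10, 30], [60, 30], [60, 10]])  # toy (shrunk) text region
poly = Polygon(points)
distance = poly.area * 1.5 / poly.length                     # unclip_ratio = 1.5
offset = pyclipper.PyclipperOffset()
offset.AddPath(points.tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.asarray(offset.Execute(distance)[0])
print(expanded.min(axis=0), expanded.max(axis=0))            # the expanded region encloses the original box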
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
-
-        Args:
-            pred: Pred map from differentiable binarization output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor of boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box
-        """
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        # get contours from connected components on the bitmap
-        contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        for contour in contours[:self.max_candidates]:
-            # Check whether smallest enclosing bounding box is not too small
-            if np.any(contour[:, 0].max(axis=0) - contour[:, 0].min(axis=0) < min_size_box):
-                continue
-            x, y, w, h = cv2.boundingRect(contour)
-            points = np.array([[x, y], [x, y + h], [x + w, y + h], [x + w, y]])
-            # Compute objectness
-            score = self.box_score(pred, points)
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            _box = self.polygon_to_box(points)
-
-            if _box is None or _box[2] < min_size_box or _box[3] < min_size_box:  # remove too-small boxes
-                continue
-            x, y, w, h = _box
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
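As a quick sanity check of the two methods above, the post-processor can be run on a synthetic map. This assumes the DetectionPostProcessor base class and its box_score helper are importable from doctr.models.detection.core, as in the imports at the top of this module:

import numpy as np

postproc = DBPostProcessor()
bitmap = np.zeros((256, 256), dtype=np.uint8)
bitmap[100:140, 50:200] = 1                      # one synthetic text blob
pred = bitmap.astype(np.float32)                 # pretend the probability map matches the mask
boxes = postproc.bitmap_to_boxes(pred, bitmap)
print(boxes)                                     # one row: xmin, ymin, xmax, ymax (relative) and score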
-
-class FeaturePyramidNetwork(layers.Layer, NestedObject):
-    """Feature Pyramid Network as described in `"Feature Pyramid Networks for Object Detection"
-    <https://arxiv.org/pdf/1612.03144.pdf>`_.
-
-    Args:
-        channels: number of channel to output
-    """
-
-    def __init__(
-        self,
-        channels: int,
-    ) -> None:
-        super().__init__()
-        self.channels = channels
-        self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest')
-        self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)]
-        self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)]
-
-    @staticmethod
-    def build_upsampling(
-        channels: int,
-        dilation_factor: int = 1,
-    ) -> layers.Layer:
-        """Module which performs a 3x3 convolution followed by up-sampling
-
-        Args:
-            channels: number of output channels
-            dilation_factor (int): upsampling factor applied to the convolution output before concatenation
-
-        Returns:
-            a keras.layers.Layer object, wrapping these operations in a sequential module
-
-        """
-
-        _layers = conv_sequence(channels, 'relu', True, kernel_size=3)
-
-        if dilation_factor > 1:
-            _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest'))
-
-        module = keras.Sequential(_layers)
-
-        return module
-
-    def extra_repr(self) -> str:
-        return f"channels={self.channels}"
-
-    def call(
-        self,
-        x: List[tf.Tensor],
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # Channel mapping
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.inner_blocks, x)]
-        # Upsample & sum
-        for idx in range(len(results) - 2, -1, -1):  # top-down: add the upsampled coarser map to the next finer one
-            results[idx] += self.upsample(results[idx + 1])
-        # Conv & upsample
-        results = [block(fmap, **kwargs) for block, fmap in zip(self.layer_blocks, results)]
-
-        return layers.concatenate(results)
-
-
-class DBNet(DetectionModel, NestedObject):
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        fpn_channels: number of channels each extracted feature map is mapped to
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: IntermediateLayerGetter,
-        fpn_channels: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(cfg=cfg)
-
-        self.shrink_ratio = 0.4
-        self.thresh_min = 0.3
-        self.thresh_max = 0.7
-        self.min_size_box = 3
-
-        self.feat_extractor = feature_extractor
-
-        self.fpn = FeaturePyramidNetwork(channels=fpn_channels)
-        # Initialize kernels
-        _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
-        output_shape = tuple(self.fpn(_inputs).shape)
-
-        self.probability_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-        self.threshold_head = keras.Sequential(
-            [
-                *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]),
-                layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'),
-                layers.BatchNormalization(),
-                layers.Activation('relu'),
-                layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'),
-            ]
-        )
-
-        self.postprocessor = DBPostProcessor()
-
-    @staticmethod
-    def compute_distance(
-        xs: np.array,
-        ys: np.array,
-        a: np.array,
-        b: np.array,
-        eps: float = 1e-7,
-    ) -> float:
-        """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
-
-        Args:
-            xs : map of x coordinates (height, width)
-            ys : map of y coordinates (height, width)
-            a: first point defining the [ab] segment
-            b: second point defining the [ab] segment
-
-        Returns:
-            The computed distance
-
-        """
-        square_dist_1 = np.square(xs - a[0]) + np.square(ys - a[1])
-        square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
-        square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
-        cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
-        square_sin = 1 - np.square(cosin)
-        square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
-        result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
-        return result
-
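In other words, the main branch above evaluates the perpendicular distance from p = (xs, ys) to the line through a and b via the triangle-area identity

$$d(p, ab) = \frac{\lVert pa\rVert\,\lVert pb\rVert\,\lvert\sin\widehat{apb}\rvert}{\lVert ab\rVert} = \frac{2\,\mathrm{Area}(p, a, b)}{\lVert ab\rVert},$$

and falls back to the distance to the nearest endpoint, $\min(\lVert pa\rVert, \lVert pb\rVert)$, when the cosine test in the code (cosin < 0) indicates the angle at p is acute, which approximates the case where p lies beyond one end of the segment.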
-    def draw_thresh_map(
-        self,
-        polygon: np.array,
-        canvas: np.array,
-        mask: np.array,
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        """Draw a polygon threshold map on a canvas, as described in the DB paper
-
-        Args:
-            polygon : array of coord., to draw the boundary of the polygon
-            canvas : threshold map to fill with polygons
-            mask : mask for training on threshold polygons
-        """
-        if polygon.ndim != 2 or polygon.shape[1] != 2:
-            raise AttributeError("polygon should be a 2 dimensional array of coords")
-
-        # Augment polygon by shrink_ratio
-        polygon_shape = Polygon(polygon)
-        distance = polygon_shape.area * (1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
-        subject = [tuple(coor) for coor in polygon]  # Get coord as list of tuples
-        padding = pyclipper.PyclipperOffset()
-        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-        padded_polygon = np.array(padding.Execute(distance)[0])
-
-        # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
-
-        # Get min/max to recover polygon after distance computation
-        xmin = padded_polygon[:, 0].min()
-        xmax = padded_polygon[:, 0].max()
-        ymin = padded_polygon[:, 1].min()
-        ymax = padded_polygon[:, 1].max()
-        width = xmax - xmin + 1
-        height = ymax - ymin + 1
-        # Get absolute polygon for distance computation
-        polygon[:, 0] = polygon[:, 0] - xmin
-        polygon[:, 1] = polygon[:, 1] - ymin
-        # Get absolute padded polygon
-        xs = np.broadcast_to(np.linspace(0, width - 1, num=width).reshape(1, width), (height, width))
-        ys = np.broadcast_to(np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width))
-
-        # Compute distance map to fill the padded polygon
-        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
-        for i in range(polygon.shape[0]):
-            j = (i + 1) % polygon.shape[0]
-            absolute_distance = self.compute_distance(xs, ys, polygon[i], polygon[j])
-            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
-        distance_map = np.min(distance_map, axis=0)
-
-        # Clip the padded polygon inside the canvas
-        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
-        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
-        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
-        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
-
-        # Fill the canvas with the distances computed inside the valid padded polygon
-        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
-            1 - distance_map[
-                ymin_valid - ymin:ymax_valid - ymin + 1,
-                xmin_valid - xmin:xmax_valid - xmin + 1
-            ],
-            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]
-        )
-
-        return polygon, canvas, mask
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.uint8)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-        thresh_target = np.zeros(output_shape, dtype=np.uint8)
-        thresh_mask = np.ones(output_shape, dtype=np.uint8)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            polys = np.stack([
-                abs_boxes[:, [0, 1]],
-                abs_boxes[:, [0, 3]],
-                abs_boxes[:, [2, 3]],
-                abs_boxes[:, [2, 1]],
-            ], axis=1)
-
-            for box, box_size, poly, is_ambiguous in zip(abs_boxes, boxes_size, polys, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-
-                # Negative shrink for gt, as described in paper
-                polygon = Polygon(poly)
-                distance = polygon.area * (1 - np.power(self.shrink_ratio, 2)) / polygon.length
-                subject = [tuple(coor) for coor in poly]
-                padding = pyclipper.PyclipperOffset()
-                padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-                shrinked = padding.Execute(-distance)
-
-                # Draw polygon on gt if it is valid
-                if len(shrinked) == 0:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                shrinked = np.array(shrinked[0]).reshape(-1, 2)
-                if shrinked.shape[0] <= 2 or not Polygon(shrinked).is_valid:
-                    seg_mask[box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                cv2.fillPoly(seg_target[idx], [shrinked.astype(np.int32)], 1)
-
-                # Draw on both thresh map and thresh mask
-                poly, thresh_target[idx], thresh_mask[idx] = self.draw_thresh_map(poly, thresh_target[idx],
-                                                                                  thresh_mask[idx])
-
-        thresh_target = thresh_target.astype(np.float32) * (self.thresh_max - self.thresh_min) + self.thresh_min
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-        thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32)
-        thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask, thresh_target, thresh_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        thresh_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
-        and flags for each image, then compute the loss with the model output
-
-        Args:
-            out_map: output feature map of the model of shape (N, H, W, C)
-            thresh_map: threshold map of shape (N, H, W, C)
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-
-        prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1]))
-        thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1]))
-
-        seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask]
-
-        neg_target = 1 - seg_target[seg_mask]
-        positive_count = tf.math.reduce_sum(seg_target[seg_mask])
-        negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count])
-        negative_loss = bce_loss * neg_target
-        negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32))
-        sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss)
-        balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6)
-
-        # Compute dice loss for approxbin_map
-        bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask])))
-
-        bce_min = tf.math.reduce_min(bce_loss)
-        weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.
-        inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights)
-        union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8
-        dice_loss = 1 - 2.0 * inter / union
-
-        # Compute l1 loss for thresh_map
-        l1_scale = 10.
-        if tf.reduce_any(thresh_mask):
-            l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask]))
-        else:
-            l1_loss = tf.constant(0.)
-
-        return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss
-
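Summing up, with the constants hard-coded above (bce_scale = 5, l1_scale = 10), the returned objective is

$$\mathcal{L} = 5\,\mathcal{L}_{\mathrm{balanced\ BCE}} + \mathcal{L}_{\mathrm{dice}} + 10\,\mathcal{L}_{\ell_1},$$

i.e. a hard-negative-balanced cross-entropy on the probability map, a weighted dice loss on the approximate binary map, and an L1 loss on the threshold map, following the composite loss of the DB paper with a dice term standing in for the binary-map loss.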
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        feat_maps = self.feat_extractor(x, **kwargs)
-        feat_concat = self.fpn(feat_maps, **kwargs)
-        logits = self.probability_head(feat_concat, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            thresh_map = self.threshold_head(feat_concat, **kwargs)
-            loss = self.compute_loss(logits, thresh_map, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels'])
-
-    # Feature extractor
-    resnet = tf.keras.applications.__dict__[_cfg['backbone']](
-        include_top=False,
-        weights=None,
-        input_shape=_cfg['input_shape'],
-        pooling=None,
-    )
-
-    feat_extractor = IntermediateLayerGetter(
-        resnet,
-        _cfg['fpn_layers'],
-    )
-
-    kwargs['fpn_channels'] = _cfg['fpn_channels']
-
-    # Build the model
-    model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet('db_resnet50', pretrained, **kwargs)
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.9.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index b3523b2fb5..5cf3b58dbb 100644 --- a/v0.9.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.differentiable_binarization.tensorflow - docTR documentation @@ -731,7 +731,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.9.0/_modules/doctr/models/detection/fast/tensorflow.html index 81a4fa8f54..3e2da22214 100644 --- a/v0.9.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.fast.tensorflow - docTR documentation @@ -762,7 +762,7 @@

Source code for doctr.models.detection.fast.tensorflow

- +

diff --git a/v0.9.0/_modules/doctr/models/detection/linknet.html b/v0.9.0/_modules/doctr/models/detection/linknet.html deleted file mode index 129cfdce8b..0000000000 --- a/v0.9.0/_modules/doctr/models/detection/linknet.html +++ /dev/null @@ -1,644 +0,0 @@
- doctr.models.detection.linknet - docTR documentation

Source code for doctr.models.detection.linknet

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-# Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
-
-from copy import deepcopy
-import tensorflow as tf
-import numpy as np
-import cv2
-from tensorflow.keras import layers, Sequential
-from typing import Dict, Any, Tuple, Optional, List
-
-from .core import DetectionModel, DetectionPostProcessor
-from ..backbones import ResnetStage
-from ..utils import conv_sequence, load_pretrained_params
-from ...utils.repr import NestedObject
-
-__all__ = ['LinkNet', 'linknet', 'LinkNetPostProcessor']
-
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'linknet': {
-        'mean': (0.798, 0.785, 0.772),
-        'std': (0.264, 0.2749, 0.287),
-        'out_chan': 1,
-        'input_shape': (1024, 1024, 3),
-        'post_processor': 'LinkNetPostProcessor',
-        'url': None,
-    },
-}
-
-
-class LinkNetPostProcessor(DetectionPostProcessor):
-    """Implements a post processor for LinkNet model.
-
-    Args:
-        min_size_box: minimal length (pix) to keep a box
-        box_thresh: minimal objectness score to consider a box
-        bin_thresh: threshold used to binarize the probability map (p_map) at inference time
-
-    """
-    def __init__(
-        self,
-        min_size_box: int = 3,
-        bin_thresh: float = 0.15,
-        box_thresh: float = 0.1,
-    ) -> None:
-        super().__init__(
-            box_thresh,
-            bin_thresh
-        )
-
-    def bitmap_to_boxes(
-        self,
-        pred: np.ndarray,
-        bitmap: np.ndarray,
-    ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
-
-        Args:
-            pred: Pred map from differentiable linknet output
-            bitmap: Bitmap map computed from pred (binarized)
-
-        Returns:
-            np tensor of boxes for the bitmap, each box is a 5-element list
-                containing xmin, ymin, xmax, ymax, score for the box
-        """
-        label_num, labelimage = cv2.connectedComponents(bitmap.astype(np.uint8), connectivity=4)
-        height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
-        boxes = []
-        for label in range(1, label_num + 1):
-            points = np.array(np.where(labelimage == label)[::-1]).T
-            if points.shape[0] < 4:  # remove polygons with 3 points or less
-                continue
-            score = self.box_score(pred, points.reshape(-1, 2))
-            if self.box_thresh > score:   # remove polygons with a weak objectness
-                continue
-            x, y, w, h = cv2.boundingRect(points)
-            if min(w, h) < min_size_box:  # filter too small boxes
-                continue
-            # compute relative polygon to get rid of img shape
-            xmin, ymin, xmax, ymax = x / width, y / height, (x + w) / width, (y + h) / height
-            boxes.append([xmin, ymin, xmax, ymax, score])
-        return np.clip(np.asarray(boxes), 0, 1) if len(boxes) > 0 else np.zeros((0, 5), dtype=np.float32)
-
-
-def decoder_block(in_chan: int, out_chan: int) -> Sequential:
-    """Creates a LinkNet decoder block"""
-
-    return Sequential([
-        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
-        layers.Conv2DTranspose(
-            filters=in_chan // 4,
-            kernel_size=3,
-            strides=2,
-            padding="same",
-            use_bias=False,
-            kernel_initializer='he_normal'
-        ),
-        layers.BatchNormalization(),
-        layers.Activation('relu'),
-        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
-    ])
-
-
-class LinkNetFPN(layers.Layer, NestedObject):
-    """LinkNet Encoder-Decoder module
-
-    """
-
-    def __init__(
-        self,
-    ) -> None:
-
-        super().__init__()
-        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
-        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
-        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
-        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
-        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
-        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
-        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
-        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
-
-    def call(
-        self,
-        x: tf.Tensor
-    ) -> tf.Tensor:
-        x_1 = self.encoder_1(x)
-        x_2 = self.encoder_2(x_1)
-        x_3 = self.encoder_3(x_2)
-        x_4 = self.encoder_4(x_3)
-        y_4 = self.decoder_4(x_4)
-        y_3 = self.decoder_3(y_4 + x_3)
-        y_2 = self.decoder_2(y_3 + x_2)
-        y_1 = self.decoder_1(y_2 + x_1)
-        return y_1
-
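Schematically, with x_i the encoder outputs, the call above applies LinkNet's additive skip connections:

$$y_4 = D_4(x_4), \qquad y_i = D_i(y_{i+1} + x_i) \quad \text{for } i = 3, 2, 1,$$

where each decoder block D_i doubles the spatial resolution through its strided transposed convolution and brings the channel count back down to that of the next (finer) encoder stage.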
-
-class LinkNet(DetectionModel, NestedObject):
-    """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation"
-    <https://arxiv.org/pdf/1707.03718.pdf>`_.
-
-    Args:
-        out_chan: number of channels for the output
-    """
-
-    _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor']
-
-    def __init__(
-        self,
-        out_chan: int = 1,
-        input_shape: Tuple[int, int, int] = (512, 512, 3),
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(cfg=cfg)
-
-        self.stem = Sequential([
-            *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape),
-            layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'),
-        ])
-
-        self.fpn = LinkNetFPN()
-
-        self.classifier = Sequential([
-            layers.Conv2DTranspose(
-                filters=32,
-                kernel_size=3,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-            layers.BatchNormalization(),
-            layers.Activation('relu'),
-            *conv_sequence(32, 'relu', True, strides=1, kernel_size=3),
-            layers.Conv2DTranspose(
-                filters=out_chan,
-                kernel_size=2,
-                strides=2,
-                padding="same",
-                use_bias=False,
-                kernel_initializer='he_normal'
-            ),
-        ])
-
-        self.min_size_box = 3
-
-        self.postprocessor = LinkNetPostProcessor()
-
-    def compute_target(
-        self,
-        target: List[Dict[str, Any]],
-        output_shape: Tuple[int, int, int],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-
-        seg_target = np.zeros(output_shape, dtype=np.bool)
-        seg_mask = np.ones(output_shape, dtype=np.bool)
-
-        for idx, _target in enumerate(target):
-            # Draw each polygon on gt
-            if _target['boxes'].shape[0] == 0:
-                # Empty image, full masked
-                seg_mask[idx] = False
-
-            # Absolute bounding boxes
-            abs_boxes = _target['boxes'].copy()
-            abs_boxes[:, [0, 2]] *= output_shape[-1]
-            abs_boxes[:, [1, 3]] *= output_shape[-2]
-            abs_boxes = abs_boxes.round().astype(np.int32)
-
-            boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
-
-            for box, box_size, is_ambiguous in zip(abs_boxes, boxes_size, _target['flags']):
-                # Mask ambiguous boxes
-                if is_ambiguous:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Mask boxes that are too small
-                if box_size < self.min_size_box:
-                    seg_mask[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = False
-                    continue
-                # Fill polygon with 1
-                seg_target[idx, box[1]: box[3] + 1, box[0]: box[2] + 1] = True
-
-        seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32)
-        seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool)
-
-        return seg_target, seg_mask
-
-    def compute_loss(
-        self,
-        out_map: tf.Tensor,
-        target: List[Dict[str, Any]]
-    ) -> tf.Tensor:
-        """Compute a batch of gts and masks from a list of boxes and flags for each image,
-        then compute the BCE loss between the probability map and these targets
-
-        Args:
-            out_map: output feature map of the model of shape N x H x W x 1
-            target: list of dictionary where each dict has a `boxes` and a `flags` entry
-
-        Returns:
-            A loss tensor
-        """
-        seg_target, seg_mask = self.compute_target(target, out_map.shape[:3])
-
-        # Compute BCE loss
-        return tf.math.reduce_mean(tf.keras.losses.binary_crossentropy(
-            seg_target[seg_mask],
-            tf.squeeze(out_map, axis=[-1])[seg_mask],
-            from_logits=True
-        ))
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[Dict[str, Any]]] = None,
-        return_model_output: bool = False,
-        return_boxes: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        logits = self.stem(x)
-        logits = self.fpn(logits)
-        logits = self.classifier(logits)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output or target is None or return_boxes:
-            prob_map = tf.math.sigmoid(logits)
-        if return_model_output:
-            out["out_map"] = prob_map
-
-        if target is None or return_boxes:
-            # Post-process boxes
-            out["boxes"] = self.postprocessor(prob_map)
-
-        if target is not None:
-            loss = self.compute_loss(logits, target)
-            out['loss'] = loss
-
-        return out
-
-
-def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['out_chan'] = kwargs.get('out_chan', _cfg['out_chan'])
-
-    kwargs['out_chan'] = _cfg['out_chan']
-    kwargs['input_shape'] = _cfg['input_shape']
-    # Build the model
-    model = LinkNet(cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def linknet(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import linknet - >>> model = linknet(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet('linknet', pretrained, **kwargs)
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.9.0/_modules/doctr/models/detection/linknet/tensorflow.html index c5fd053513..d374bb6d1e 100644 --- a/v0.9.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.detection.linknet.tensorflow - docTR documentation @@ -698,7 +698,7 @@

Source code for doctr.models.detection.linknet.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/detection/zoo.html b/v0.9.0/_modules/doctr/models/detection/zoo.html index 26a8767a16..345175018e 100644 --- a/v0.9.0/_modules/doctr/models/detection/zoo.html +++ b/v0.9.0/_modules/doctr/models/detection/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.detection.zoo - docTR documentation @@ -431,7 +431,7 @@

Source code for doctr.models.detection.zoo

     
   
- +

diff --git a/v0.9.0/_modules/doctr/models/export.html b/v0.9.0/_modules/doctr/models/export.html deleted file mode index f25a81aa21..0000000000 --- a/v0.9.0/_modules/doctr/models/export.html +++ /dev/null @@ -1,411 +0,0 @@
- doctr.models.export - docTR documentation

Source code for doctr.models.export

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import logging
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from typing import Tuple
-
-logging.getLogger("tensorflow").setLevel(logging.DEBUG)
-
-
-__all__ = ['convert_to_tflite', 'convert_to_fp16', 'quantize_model']
-
-
-
-[docs] -def convert_to_tflite(tf_model: Model) -> bytes: - """Converts a model to TFLite format - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_tflite, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_tflite(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - return converter.convert()
- - - -
-[docs] -def convert_to_fp16(tf_model: Model) -> bytes: - """Converts a model to half precision - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import convert_to_fp16, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = convert_to_fp16(model) - - Args: - tf_model: a keras model - - Returns: - bytes: the serialized FP16 model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - return converter.convert()
- - - -
-[docs] -def quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) -> bytes: - """Quantize a Tensorflow model - - Example:: - >>> from tensorflow.keras import Sequential - >>> from doctr.models import quantize_model, conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> serialized_model = quantize_model(model, (224, 224, 3)) - - Args: - tf_model: a keras model - input_shape: shape of the expected input tensor (excluding batch dimension) with channel last order - - Returns: - bytes: the serialized quantized model - """ - converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # Float fallback for operators that do not have an integer implementation - def representative_dataset(): - for _ in range(100): - data = np.random.rand(1, *input_shape) - yield [data.astype(np.float32)] - - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - - return converter.convert()
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/models/factory/hub.html b/v0.9.0/_modules/doctr/models/factory/hub.html index 6161edb5f2..3e02d45cd1 100644 --- a/v0.9.0/_modules/doctr/models/factory/hub.html +++ b/v0.9.0/_modules/doctr/models/factory/hub.html @@ -13,7 +13,7 @@ - + doctr.models.factory.hub - docTR documentation @@ -565,7 +565,7 @@

Source code for doctr.models.factory.hub

     
   
- +

diff --git a/v0.9.0/_modules/doctr/models/recognition/crnn.html b/v0.9.0/_modules/doctr/models/recognition/crnn.html deleted file mode index daa2393439..0000000000 --- a/v0.9.0/_modules/doctr/models/recognition/crnn.html +++ /dev/null @@ -1,565 +0,0 @@
- doctr.models.recognition.crnn - docTR documentation

Source code for doctr.models.recognition.crnn

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import layers
-from tensorflow.keras.models import Sequential
-from typing import Tuple, Dict, Any, Optional, List
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel, RecognitionPostProcessor
-
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'crnn_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
-    },
-    'crnn_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': 'resnet31', 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'CTCPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
-    },
-}
-
-
-class CTCPostProcessor(RecognitionPostProcessor):
-    """
-    Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor
-    ) -> List[Tuple[str, float]]:
-        """
-        Performs decoding of raw output with CTC and decoding of CTC predictions
-        with the label_to_idx mapping dictionary
-
-        Args:
-            logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1
-
-        Returns:
-            A list of decoded (word, confidence) tuples of length BATCH_SIZE
-
-        """
-        # Decode CTC
-        _decoded, _log_prob = tf.nn.ctc_beam_search_decoder(
-            tf.transpose(logits, perm=[1, 0, 2]),
-            tf.fill(logits.shape[0], logits.shape[1]),
-            beam_width=1, top_paths=1,
-        )
-        out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab))
-        probs = tf.math.exp(tf.squeeze(_log_prob, axis=1))
-
-        # Map it to characters
-        _decoded_strings_pred = tf.strings.reduce_join(
-            inputs=tf.nn.embedding_lookup(self._embedding, out_idxs),
-            axis=-1
-        )
-        _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
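Since beam_width=1, the beam search above degenerates to a greedy CTC decode: take the argmax class at every time step, merge consecutive repeats, then drop the blank symbol. A hedged pure-Python sketch of that collapse, with a toy vocabulary and toy logits (not the model's):

import numpy as np

vocab = "abc"                        # toy vocabulary; index len(vocab) plays the role of the CTC blank
logits = np.array([                  # toy scores of shape (seq_len=5, len(vocab) + 1)
    [2.0, 0.1, 0.1, 0.3],
    [2.0, 0.1, 0.1, 0.3],
    [0.1, 0.1, 0.1, 3.0],
    [0.1, 2.5, 0.1, 0.3],
    [0.1, 2.5, 0.1, 0.3],
])
best_path = logits.argmax(axis=-1)   # [0, 0, 3, 1, 1]
collapsed = [idx for i, idx in enumerate(best_path) if i == 0 or idx != best_path[i - 1]]
decoded = "".join(vocab[idx] for idx in collapsed if idx < len(vocab))
print(decoded)                       # prints: ab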
-
-class CRNN(RecognitionModel):
-    """Implements a CRNN architecture as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of units in the LSTM layers
-        cfg: configuration dictionary
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor: tf.keras.Model,
-        vocab: str,
-        rnn_units: int = 128,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(vocab=vocab, cfg=cfg)
-        self.feat_extractor = feature_extractor
-
-        # Initialize kernels
-        h, w, c = self.feat_extractor.output_shape[1:]
-        self.max_length = w
-
-        self.decoder = Sequential(
-            [
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)),
-                layers.Dense(units=len(vocab) + 1)
-            ]
-        )
-        self.decoder.build(input_shape=(None, w, h * c))
-
-        self.postprocessor = CTCPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        target: List[str],
-    ) -> tf.Tensor:
-        """Compute CTC loss for the model.
-
-        Args:
-            model_output: predicted logits of the model
-            target: list of ground-truth label strings (encoded internally into gt tensors and sequence lengths)
-
-        Returns:
-            The loss of the model on the batch
-        """
-        gt, seq_len = self.compute_target(target)
-        batch_len = model_output.shape[0]
-        input_length = model_output.shape[1] * tf.ones(shape=(batch_len))
-        ctc_loss = tf.nn.ctc_loss(
-            gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab)
-        )
-        return ctc_loss
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        # B x H x W x C --> B x W x H x C
-        transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3])
-        w, h, c = transposed_feat.get_shape().as_list()[1:]
-        # B x W x H x C --> B x W x H * C
-        features_seq = tf.reshape(transposed_feat, shape=(-1, w, h * c))
-        logits = self.decoder(features_seq, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = logits
-
-        if target is None or return_preds:
-            # Post-process boxes
-            out["preds"] = self.postprocessor(logits)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(logits, target)
-
-        return out
-
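To make the feature-to-sequence reshape above concrete, here is a small sketch with made-up shapes (the real backbone output depends on the chosen architecture and input size):

import tensorflow as tf

features = tf.random.uniform((1, 4, 32, 512))            # assumed backbone output: B x H x W x C
transposed = tf.transpose(features, perm=[0, 2, 1, 3])   # B x W x H x C -> (1, 32, 4, 512)
w, h, c = transposed.shape[1:]
sequence = tf.reshape(transposed, (-1, w, h * c))        # (1, 32, 2048): one feature vector per horizontal step
print(sequence.shape)                                    # each of the 32 steps feeds the bidirectional LSTM decoder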
-
-def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[_cfg['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-
-    # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, _cfg['url'])
-
-    return model
-
-
-
-[docs] -def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
- - - -def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_resnet31 - >>> model = crnn_resnet31(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_resnet31', pretrained, **kwargs) -
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.9.0/_modules/doctr/models/recognition/crnn/tensorflow.html index a00647e1b2..a8a19605ba 100644 --- a/v0.9.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.crnn.tensorflow - docTR documentation @@ -650,7 +650,7 @@

Source code for doctr.models.recognition.crnn.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.9.0/_modules/doctr/models/recognition/master/tensorflow.html index 446786da5f..fa02c4de73 100644 --- a/v0.9.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.master.tensorflow - docTR documentation @@ -644,7 +644,7 @@

Source code for doctr.models.recognition.master.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.9.0/_modules/doctr/models/recognition/parseq/tensorflow.html index 3e2d71751a..3c719b14fe 100644 --- a/v0.9.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.parseq.tensorflow - docTR documentation @@ -840,7 +840,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/recognition/sar.html b/v0.9.0/_modules/doctr/models/recognition/sar.html deleted file mode 100644 index 2482e9f156..0000000000 --- a/v0.9.0/_modules/doctr/models/recognition/sar.html +++ /dev/null @@ -1,712 +0,0 @@ - - - - - - - - - - - - doctr.models.recognition.sar - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.models.recognition.sar

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-from copy import deepcopy
-import tensorflow as tf
-from tensorflow.keras import Sequential, layers
-from typing import Tuple, Dict, List, Any, Optional
-
-from .. import backbones
-from ..utils import load_pretrained_params
-from .core import RecognitionModel
-from .core import RecognitionPostProcessor
-from doctr.utils.repr import NestedObject
-
-__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
-
-default_cfgs: Dict[str, Dict[str, Any]] = {
-    'sar_vgg16_bn': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
-    },
-    'sar_resnet31': {
-        'mean': (.5, .5, .5),
-        'std': (1., 1., 1.),
-        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
-        'input_shape': (32, 128, 3),
-        'post_processor': 'SARPostProcessor',
-        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
-                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
-    },
-}
-
-
-class AttentionModule(layers.Layer, NestedObject):
-    """Implements attention module of the SAR model
-
-    Args:
-        attention_units: number of hidden attention units
-
-    """
-    def __init__(
-        self,
-        attention_units: int
-    ) -> None:
-
-        super().__init__()
-        self.hidden_state_projector = layers.Conv2D(
-            attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal',
-        )
-        self.features_projector = layers.Conv2D(
-            attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal',
-        )
-        self.attention_projector = layers.Conv2D(
-            1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal',
-        )
-        self.flatten = layers.Flatten()
-
-    def call(
-        self,
-        features: tf.Tensor,
-        hidden_state: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        [H, W] = features.get_shape().as_list()[1:3]
-        # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units)
-        hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs)
-        # shape (N, H, W, vgg_units) -> (N, H, W, attention_units)
-        features_projection = self.features_projector(features, **kwargs)
-        projection = tf.math.tanh(hidden_state_projection + features_projection)
-        # shape (N, H, W, attention_units) -> (N, H, W, 1)
-        attention = self.attention_projector(projection, **kwargs)
-        # shape (N, H, W, 1) -> (N, H * W)
-        attention = self.flatten(attention)
-        attention = tf.nn.softmax(attention)
-        # shape (N, H * W) -> (N, H, W, 1)
-        attention_map = tf.reshape(attention, [-1, H, W, 1])
-        glimpse = tf.math.multiply(features, attention_map)
-        # shape (N, H, W, C) -> (N, C)
-        glimpse = tf.reduce_sum(glimpse, axis=[1, 2])
-        return glimpse
-
-
-class SARDecoder(layers.Layer, NestedObject):
-    """Implements decoder module of the SAR model
-
-    Args:
-        rnn_units: number of hidden units in recurrent cells
-        max_length: maximum length of a sequence
-        vocab_size: number of classes in the model alphabet
-        embedding_units: number of hidden embedding units
-        attention_units: number of hidden attention units
-        num_decoder_layers: number of LSTM layers to stack
-
-    """
-    def __init__(
-        self,
-        rnn_units: int,
-        max_length: int,
-        vocab_size: int,
-        embedding_units: int,
-        attention_units: int,
-        num_decoder_layers: int = 2,
-        input_shape: Optional[List[Tuple[Optional[int]]]] = None,
-    ) -> None:
-
-        super().__init__()
-        self.vocab_size = vocab_size
-        self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
-        )
-        self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
-        self.attention_module = AttentionModule(attention_units)
-        self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units))
-        self.max_length = max_length
-
-        # Initialize kernels
-        if input_shape is not None:
-            self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units)))
-
-    def call(
-        self,
-        features: tf.Tensor,
-        holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-
-        # initialize states (each of shape (N, rnn_units))
-        states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=tf.float32
-        )
-        # run first step of lstm
-        # holistic: shape (N, rnn_units)
-        _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos>)
-        symbol = tf.fill(features.shape[0], self.vocab_size + 1)
-        logits_list = []
-        if kwargs.get('training') and gt is None:
-            raise ValueError('Need to provide labels during training for teacher forcing')
-        for t in range(self.max_length + 1):  # keep 1 step for <eos>
-            # one-hot symbol with depth vocab_size + 1
-            # embeded_symbol: shape (N, embedding_units)
-            embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs)
-            logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs)
-            glimpse = self.attention_module(
-                features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs,
-            )
-            # logits: shape (N, rnn_units), glimpse: shape (N, C)
-            logits = tf.concat([logits, glimpse], axis=-1)
-            # shape (N, rnn_units + C) -> (N, vocab_size + 1)
-            logits = self.output_dense(logits, **kwargs)
-            # update symbol with predicted logits for t+1 step
-            if kwargs.get('training'):
-                symbol = gt[:, t]
-            else:
-                symbol = tf.argmax(logits, axis=-1)
-            logits_list.append(logits)
-        outputs = tf.stack(logits_list, axis=1)  # shape (N, max_length + 1, vocab_size + 1)
-
-        return outputs
-
-
-class SAR(RecognitionModel):
-    """Implements a SAR architecture as described in `"Show, Attend and Read: A Simple and Strong Baseline for
-    Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
-
-    Args:
-        feature_extractor: the backbone serving as feature extractor
-        vocab: vocabulary used for encoding
-        rnn_units: number of hidden units in both encoder and decoder LSTM
-        embedding_units: number of embedding units
-        attention_units: number of hidden units in attention module
-        max_length: maximum word length handled by the model
-        num_decoders: number of LSTM to stack in decoder layer
-
-    """
-
-    _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor']
-
-    def __init__(
-        self,
-        feature_extractor,
-        vocab: str,
-        rnn_units: int = 512,
-        embedding_units: int = 512,
-        attention_units: int = 512,
-        max_length: int = 30,
-        num_decoders: int = 2,
-        cfg: Optional[Dict[str, Any]] = None,
-    ) -> None:
-
-        super().__init__(vocab=vocab, cfg=cfg)
-
-        self.max_length = max_length + 1  # Add 1 timestep for EOS after the longest word
-
-        self.feat_extractor = feature_extractor
-
-        self.encoder = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True),
-                layers.LSTM(units=rnn_units, return_sequences=False)
-            ]
-        )
-        # Initialize the kernels (watch out for reduce_max)
-        self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:])
-
-        self.decoder = SARDecoder(
-            rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders,
-            input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape]
-        )
-
-        self.postprocessor = SARPostProcessor(vocab=vocab)
-
-    def compute_loss(
-        self,
-        model_output: tf.Tensor,
-        gt: tf.Tensor,
-        seq_len: tf.Tensor,
-    ) -> tf.Tensor:
-        """Compute categorical cross-entropy loss for the model.
-        Sequences are masked after the EOS character.
-
-        Args:
-            gt: the encoded tensor with gt labels
-            model_output: predicted logits of the model
-            seq_len: lengths of each gt word inside the batch
-
-        Returns:
-            The loss of the model on the batch
-        """
-        # Input length : number of timesteps
-        input_len = tf.shape(model_output)[1]
-        # Add one for additional <eos> token
-        seq_len = seq_len + 1
-        # One-hot gt labels
-        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
-        # Compute loss
-        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt, model_output)
-        # Compute mask
-        mask_values = tf.zeros_like(cce)
-        mask_2d = tf.sequence_mask(seq_len, input_len)
-        masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
-        return tf.expand_dims(ce_loss, axis=1)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        target: Optional[List[str]] = None,
-        return_model_output: bool = False,
-        return_preds: bool = False,
-        **kwargs: Any,
-    ) -> Dict[str, Any]:
-
-        features = self.feat_extractor(x, **kwargs)
-        pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
-        encoded = self.encoder(pooled_features, **kwargs)
-        if target is not None:
-            gt, seq_len = self.compute_target(target)
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
-
-        out: Dict[str, tf.Tensor] = {}
-        if return_model_output:
-            out["out_map"] = decoded_features
-
-        if target is None or return_preds:
-            # Post-process predictions into (word, confidence) pairs
-            out["preds"] = self.postprocessor(decoded_features)
-
-        if target is not None:
-            out['loss'] = self.compute_loss(decoded_features, gt, seq_len)
-
-        return out
-
-
-class SARPostProcessor(RecognitionPostProcessor):
-    """Post processor for SAR architectures
-
-    Args:
-        vocab: string containing the ordered sequence of supported characters
-        ignore_case: if True, ignore case of letters
-        ignore_accents: if True, ignore accents of letters
-    """
-
-    def __call__(
-        self,
-        logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
-        # compute pred with argmax for attention models
-        out_idxs = tf.math.argmax(logits, axis=2)
-        # N x L
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
-
-        # decode raw output of the model with tf_label_to_idx
-        out_idxs = tf.cast(out_idxs, dtype='int32')
-        decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(self._embedding, out_idxs), axis=-1)
-        decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
-        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0]
-        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
-
-        return list(zip(word_values, probs.numpy().tolist()))
-
-
-def _sar(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> SAR:
-
-    # Patch the config
-    _cfg = deepcopy(default_cfgs[arch])
-    _cfg['input_shape'] = input_shape or _cfg['input_shape']
-    _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
-    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
-    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
-    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
-    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
-    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
-
-    # Feature extractor
-    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
-        input_shape=_cfg['input_shape'],
-        include_top=False,
-    )
-
-    kwargs['vocab'] = _cfg['vocab']
-    kwargs['rnn_units'] = _cfg['rnn_units']
-    kwargs['embedding_units'] = _cfg['embedding_units']
-    kwargs['attention_units'] = _cfg['attention_units']
-    kwargs['max_length'] = _cfg['max_length']
-    kwargs['num_decoders'] = _cfg['num_decoders']
-
-    # Build the model
-    model = SAR(feat_extractor, cfg=_cfg, **kwargs)
-    # Load pretrained parameters
-    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]['url'])
-
-    return model
-
-
-
-
[docs] -def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read: A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_vgg16_bn - >>> model = sar_vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_vgg16_bn', pretrained, **kwargs)
- - - -
-
[docs] -def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: - """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read: A Simple and Strong - Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _sar('sar_resnet31', pretrained, **kwargs)
- -
-
-
-
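A minimal usage sketch of the SAR recognizer defined above, assuming the TensorFlow backend of this file, the `target`/`return_preds` keywords of the `call` method, and the (32, 128, 3) input size given in `default_cfgs`:

    >>> import tensorflow as tf
    >>> from doctr.models import sar_resnet31
    >>> model = sar_resnet31(pretrained=False)
    >>> crops = tf.random.uniform(shape=[2, 32, 128, 3], maxval=1, dtype=tf.float32)
    >>> out = model(crops, target=["hello", "world"], return_preds=True)
    >>> out["preds"]  # list of (word, confidence) pairs from SARPostProcessor
    >>> out["loss"]   # masked cross-entropy per sample, shape (2, 1)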
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.9.0/_modules/doctr/models/recognition/sar/tensorflow.html index 60d2b9b4bd..a432db0986 100644 --- a/v0.9.0/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/recognition/sar/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.sar.tensorflow - docTR documentation @@ -749,7 +749,7 @@

Source code for doctr.models.recognition.sar.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.9.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 1a97114efa..7131ac4a5b 100644 --- a/v0.9.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.9.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.vitstr.tensorflow - docTR documentation @@ -610,7 +610,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/models/recognition/zoo.html b/v0.9.0/_modules/doctr/models/recognition/zoo.html index a48a001041..cb3c812d02 100644 --- a/v0.9.0/_modules/doctr/models/recognition/zoo.html +++ b/v0.9.0/_modules/doctr/models/recognition/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.recognition.zoo - docTR documentation @@ -403,7 +403,7 @@

Source code for doctr.models.recognition.zoo

   
- + diff --git a/v0.9.0/_modules/doctr/models/zoo.html b/v0.9.0/_modules/doctr/models/zoo.html index d8485a1e3e..324b382fcf 100644 --- a/v0.9.0/_modules/doctr/models/zoo.html +++ b/v0.9.0/_modules/doctr/models/zoo.html @@ -13,7 +13,7 @@ - + doctr.models.zoo - docTR documentation @@ -572,7 +572,7 @@

Source code for doctr.models.zoo

     
   
- + diff --git a/v0.9.0/_modules/doctr/transforms/modules.html b/v0.9.0/_modules/doctr/transforms/modules.html deleted file mode 100644 index ba8269e7ef..0000000000 --- a/v0.9.0/_modules/doctr/transforms/modules.html +++ /dev/null @@ -1,734 +0,0 @@ - - - - - - - - - - - - doctr.transforms.modules - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Source code for doctr.transforms.modules

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import random
-import tensorflow as tf
-from typing import List, Any, Tuple, Callable
-
-from doctr.utils.repr import NestedObject
-from . import functional as F
-
-
-__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'ColorInversion',
-           'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality',
-           'OneOf', 'RandomApply']
-
-
-
-[docs] -class Compose(NestedObject): - """Implements a wrapper that will apply transformations sequentially - - Example:: - >>> from doctr.transforms import Compose, Resize - >>> import tensorflow as tf - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, x: Any) -> Any: - for t in self.transforms: - x = t(x) - - return x
- - - -
-[docs] -class Resize(NestedObject): - """Resizes a tensor to a target size - - Example:: - >>> from doctr.transforms import Resize - >>> import tensorflow as tf - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - output_size: expected output size - method: interpolation method - preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros - symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically - """ - def __init__( - self, - output_size: Tuple[int, int], - method: str = 'bilinear', - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = False, - ) -> None: - self.output_size = output_size - self.method = method - self.preserve_aspect_ratio = preserve_aspect_ratio - self.symmetric_pad = symmetric_pad - - def extra_repr(self) -> str: - _repr = f"output_size={self.output_size}, method='{self.method}'" - if self.preserve_aspect_ratio: - _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" - return _repr - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) - if self.preserve_aspect_ratio: - # pad width - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - return img
- - - -
-[docs] -class Normalize(NestedObject): - """Normalize a tensor to a Gaussian distribution for each channel - - Example:: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - mean: average value per channel - std: standard deviation per channel - """ - def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean, dtype=tf.float32) - self.std = tf.constant(std, dtype=tf.float32) - - def extra_repr(self) -> str: - return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= self.mean - img /= self.std - return img
- - - -
-
[docs] -class LambdaTransformation(NestedObject): - """Apply a user-defined function to a tensor - - Example:: - >>> from doctr.transforms import LambdaTransformation - >>> import tensorflow as tf - >>> transfo = LambdaTransformation(lambda x: x / 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - fn: the function to be applied to the input tensor - """ - def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None: - self.fn = fn - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return self.fn(img)
- - - -
-
[docs] -class ToGray(NestedObject): - """Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor - - Example:: - >>> from doctr.transforms import ToGray - >>> import tensorflow as tf - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - """ - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.rgb_to_grayscale(img)
- - - -
-
[docs] -class ColorInversion(NestedObject): - """Applies the following transformation to a tensor (image or batch of images): - convert to grayscale, colorize (shift 0-values randomly), and then invert colors - - Example:: - >>> from doctr.transforms import ColorInversion - >>> import tensorflow as tf - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_val: range [min_val, 1] to colorize RGB pixels - """ - def __init__(self, min_val: float = 0.6) -> None: - self.min_val = min_val - - def extra_repr(self) -> str: - return f"min_val={self.min_val}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return F.invert_colors(img, self.min_val)
- - - -
-
[docs] -class RandomBrightness(NestedObject): - """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta - to all pixels - - Example:: - >>> from doctr.transforms import RandomBrightness - >>> import tensorflow as tf - >>> transfo = RandomBrightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: the offset added to each pixel is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_brightness(img, max_delta=self.max_delta)
-
[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example:: - >>> from doctr.transforms import RandomContrast - >>> import tensorflow as tf - >>> transfo = RandomContrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
-[docs] -class RandomContrast(NestedObject): - """Randomly adjust contrast of a tensor (batch of images or image) by adjusting - each pixel: (img - mean) * contrast_factor + mean. - - Example: - >>> from doctr.transforms import Normalize - >>> import tensorflow as tf - >>> transfo = Contrast() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1) - """ - def __init__(self, delta: float = .3) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_contrast(img, lower=1 - self.delta, upper=1 / (1 - self.delta))
- - - -
-
[docs] -class RandomSaturation(NestedObject): - """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and - increasing saturation by a factor. - - Example:: - >>> from doctr.transforms import RandomSaturation - >>> import tensorflow as tf - >>> transfo = RandomSaturation() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1) - """ - def __init__(self, delta: float = .5) -> None: - self.delta = delta - - def extra_repr(self) -> str: - return f"delta={self.delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_saturation(img, lower=1 - self.delta, upper=1 + self.delta)
- - - -
-
[docs] -class RandomHue(NestedObject): - """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta - - Example:: - >>> from doctr.transforms import RandomHue - >>> import tensorflow as tf - >>> transfo = RandomHue() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - max_delta: the hue offset is randomly picked in [-max_delta, max_delta] - """ - def __init__(self, max_delta: float = 0.3) -> None: - self.max_delta = max_delta - - def extra_repr(self) -> str: - return f"max_delta={self.max_delta}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_hue(img, max_delta=self.max_delta)
- - - -
-
[docs] -class RandomGamma(NestedObject): - """Randomly performs gamma correction for a tensor (batch of images or image) - - Example:: - >>> from doctr.transforms import RandomGamma - >>> import tensorflow as tf - >>> transfo = RandomGamma() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - Args: - min_gamma: non-negative real number, lower bound for gamma param - max_gamma: non-negative real number, upper bound for gamma - min_gain: lower bound for constant multiplier - max_gain: upper bound for constant multiplier - """ - def __init__( - self, - min_gamma: float = 0.5, - max_gamma: float = 1.5, - min_gain: float = 0.8, - max_gain: float = 1.2, - ) -> None: - self.min_gamma = min_gamma - self.max_gamma = max_gamma - self.min_gain = min_gain - self.max_gain = max_gain - - def extra_repr(self) -> str: - return f"""gamma_range=({self.min_gamma}, {self.max_gamma}), - gain_range=({self.min_gain}, {self.max_gain})""" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - gamma = random.uniform(self.min_gamma, self.max_gamma) - gain = random.uniform(self.min_gain, self.max_gain) - return tf.image.adjust_gamma(img, gamma=gamma, gain=gain)
- - - -
-
[docs] -class RandomJpegQuality(NestedObject): - """Randomly adjust jpeg quality of a 3-dimensional RGB image - - Example:: - >>> from doctr.transforms import RandomJpegQuality - >>> import tensorflow as tf - >>> transfo = RandomJpegQuality() - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - min_quality: int between [0, 100] - max_quality: int between [0, 100] - """ - def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None: - self.min_quality = min_quality - self.max_quality = max_quality - - def extra_repr(self) -> str: - return f"min_quality={self.min_quality}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality( - img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality - )
- - - -
-
[docs] -class OneOf(NestedObject): - """Randomly apply one of the input transformations - - Example:: - >>> from doctr.transforms import OneOf, RandomGamma, RandomJpegQuality - >>> import tensorflow as tf - >>> transfo = OneOf([RandomJpegQuality(), RandomGamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transforms: list of transformations, only one of which will be picked - """ - - _children_names: List[str] = ['transforms'] - - def __init__(self, transforms: List[NestedObject]) -> None: - self.transforms = transforms - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - # Pick transformation - transfo = self.transforms[int(random.random() * len(self.transforms))] - # Apply - return transfo(img)
- - - -
-
[docs] -class RandomApply(NestedObject): - """Apply the input transformation with probability p - - Example:: - >>> from doctr.transforms import RandomApply, RandomGamma - >>> import tensorflow as tf - >>> transfo = RandomApply(RandomGamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - transform: transformation to apply - p: probability to apply - """ - def __init__(self, transform: NestedObject, p: float = .5) -> None: - self.transform = transform - self.p = p - - def extra_repr(self) -> str: - return f"transform={self.transform}, p={self.p}" - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - if random.random() < self.p: - return self.transform(img) - return img
- -
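The modules above are meant to be composed. Below is a short sketch of an augmentation pipeline assembled from the classes defined in this file; the particular combination and parameter values are illustrative, not prescribed by the package:

    >>> import tensorflow as tf
    >>> from doctr.transforms import Compose, Resize, Normalize, OneOf, RandomApply, RandomGamma, RandomJpegQuality
    >>> transfos = Compose([
    ...     Resize((32, 128)),
    ...     RandomApply(OneOf([RandomGamma(), RandomJpegQuality(min_quality=75)]), p=.3),
    ...     Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ... ])
    >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))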
-
-
-
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/_modules/doctr/transforms/modules/base.html b/v0.9.0/_modules/doctr/transforms/modules/base.html index c49bc667c1..caf55a50e3 100644 --- a/v0.9.0/_modules/doctr/transforms/modules/base.html +++ b/v0.9.0/_modules/doctr/transforms/modules/base.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.base - docTR documentation @@ -639,7 +639,7 @@

Source code for doctr.transforms.modules.base

- + diff --git a/v0.9.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.9.0/_modules/doctr/transforms/modules/tensorflow.html index 69301ece9b..752fb56149 100644 --- a/v0.9.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.9.0/_modules/doctr/transforms/modules/tensorflow.html @@ -13,7 +13,7 @@ - + doctr.transforms.modules.tensorflow - docTR documentation @@ -949,7 +949,7 @@

Source code for doctr.transforms.modules.tensorflow

- + diff --git a/v0.9.0/_modules/doctr/utils/metrics.html b/v0.9.0/_modules/doctr/utils/metrics.html index 2fa0f40a60..678580fbb3 100644 --- a/v0.9.0/_modules/doctr/utils/metrics.html +++ b/v0.9.0/_modules/doctr/utils/metrics.html @@ -13,7 +13,7 @@ - + doctr.utils.metrics - docTR documentation @@ -932,7 +932,7 @@

Source code for doctr.utils.metrics

     
   
- + diff --git a/v0.9.0/_modules/doctr/utils/visualization.html b/v0.9.0/_modules/doctr/utils/visualization.html index 4f109e3da7..9e56c2f210 100644 --- a/v0.9.0/_modules/doctr/utils/visualization.html +++ b/v0.9.0/_modules/doctr/utils/visualization.html @@ -13,7 +13,7 @@ - + doctr.utils.visualization - docTR documentation @@ -716,7 +716,7 @@

Source code for doctr.utils.visualization

     
   
- + diff --git a/v0.9.0/_modules/index.html b/v0.9.0/_modules/index.html index de317b11c8..50a1a152f8 100644 --- a/v0.9.0/_modules/index.html +++ b/v0.9.0/_modules/index.html @@ -13,7 +13,7 @@ - + Overview: module code - docTR documentation @@ -374,7 +374,7 @@

All modules for which code is available

- + diff --git a/v0.9.0/_sources/datasets.rst.txt b/v0.9.0/_sources/datasets.rst.txt deleted file mode 100644 index 354122f1e5..0000000000 --- a/v0.9.0/_sources/datasets.rst.txt +++ /dev/null @@ -1,68 +0,0 @@ -doctr.datasets -============== - -.. currentmodule:: doctr.datasets - -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - -.. _datasets: - -Available Datasets ------------------- -The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. - -.. autoclass:: doctr.datasets.datasets.VisionDataset - - -Here are all datasets that are available through DocTR: - -.. autoclass:: FUNSD -.. autoclass:: SROIE -.. autoclass:: CORD -.. autoclass:: OCRDataset - - -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. - -.. autoclass:: doctr.datasets.loader.DataLoader - - -.. _vocabs: - -Supported Vocabs ----------------- - -Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets -of vocabs. - -.. list-table:: DocTR Vocabs - :widths: 20 5 50 - :header-rows: 1 - - * - Name - - size - - characters - * - digits - - 10 - - 0123456789 - * - ascii_letters - - 52 - - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - * - punctuation - - 32 - - !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ - * - currency - - 5 - - £€¥¢฿ - * - latin - - 96 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° - * - french - - 154 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ - -.. autofunction:: encode_sequences diff --git a/v0.9.0/_sources/documents.rst.txt b/v0.9.0/_sources/documents.rst.txt deleted file mode 100644 index 655730073e..0000000000 --- a/v0.9.0/_sources/documents.rst.txt +++ /dev/null @@ -1,87 +0,0 @@ -doctr.documents -=============== - - -.. currentmodule:: doctr.documents - -The documents module enables users to easily access content from documents and export analysis -results to structured formats. - - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. 
automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.9.0/_sources/installing.rst.txt b/v0.9.0/_sources/installing.rst.txt deleted file mode 100644 index 5c8779dc1c..0000000000 --- a/v0.9.0/_sources/installing.rst.txt +++ /dev/null @@ -1,46 +0,0 @@ - -************ -Installation -************ - -This library requires Python 3.6 or higher. - - -Prerequisites -============= - -Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: - -* TensorFlow: `installation page `_. -* PyTorch: `installation page `_. - -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - -Via Python Package -================== - -Install the last stable release of the package using pip: - -.. code:: bash - - pip install python-doctr - - -Via Git -======= - -Install the library in developper mode: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - pip install -e doctr/. diff --git a/v0.9.0/_sources/models.rst.txt b/v0.9.0/_sources/models.rst.txt deleted file mode 100644 index 9830c6c153..0000000000 --- a/v0.9.0/_sources/models.rst.txt +++ /dev/null @@ -1,215 +0,0 @@ -doctr.models -============ - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. -* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- -Localizing text elements in images - -+---------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+==================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. 
Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for detection -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for detection is the following: - -1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. -2. batch images together -3. normalize the batch using the training data statistics - - -Detection models -^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - -.. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.linknet16 - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. - -.. autofunction:: doctr.models.detection.detection_predictor - - -Text Recognition ----------------- -Identifying strings in images - -.. list-table:: Text recognition model zoo - :widths: 20 20 15 10 10 10 - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 86.02 - - 91.3 - - 12.8 - * - sar_vgg16_bn - - (32, 128, 3) - - 21.5M - - 86.2 - - 91.7 - - 3.3 - * - sar_resnet31 - - (32, 128, 3) - - 53.1M - - **86.3** - - **92.1** - - 2.7 - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Pre-processing for recognition -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In DocTR, the pre-processing scheme for recognition is the following: - -1. resize each input image to the target size (bilinear interpolation by default) without deformation. -2. pad the image to the target size (with zeros by default) -3. batch images together -4. normalize the batch using the training data statistics - -Recognition models -^^^^^^^^^^^^^^^^^^ -Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: - - -.. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_vgg16_bn -.. autofunction:: doctr.models.recognition.sar_resnet31 -.. autofunction:: doctr.models.recognition.master - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -Combining the right components around a given architecture for easier usage. - -.. 
autofunction:: doctr.models.recognition.recognition_predictor - - -End-to-End OCR --------------- -Predictors that localize and identify text elements in images - -+-----------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+=============================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. - -Results on private ocr datasets - -+------------------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | -+====================================+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. - -.. autofunction:: doctr.models.zoo.ocr_predictor - - -Model export ------------- -Utility functions to make the most of document analysis models. - -.. currentmodule:: doctr.models.export - -Model compression -^^^^^^^^^^^^^^^^^ - -.. autofunction:: convert_to_tflite - -.. autofunction:: convert_to_fp16 - -.. autofunction:: quantize_model - -Using SavedModel -^^^^^^^^^^^^^^^^ - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.9.0/_sources/transforms.rst.txt b/v0.9.0/_sources/transforms.rst.txt deleted file mode 100644 index 0230fe75f5..0000000000 --- a/v0.9.0/_sources/transforms.rst.txt +++ /dev/null @@ -1,32 +0,0 @@ -doctr.transforms -================ - -.. currentmodule:: doctr.transforms - -Data transformations are part of both training and inference procedure. Drawing inspiration from the design of `torchvision `_, we express transformations as composable modules. - - -Supported transformations -------------------------- -Here are all transformations that are available through DocTR: - -.. autoclass:: Resize -.. autoclass:: Normalize -.. autoclass:: LambdaTransformation -.. autoclass:: ToGray -.. autoclass:: ColorInversion -.. autoclass:: RandomBrightness -.. autoclass:: RandomContrast -.. autoclass:: RandomSaturation -.. autoclass:: RandomHue -.. autoclass:: RandomGamma -.. autoclass:: RandomJpegQuality - - -Composing transformations ---------------------------------------------- -It is common to require several transformations to be performed consecutively. - -.. autoclass:: Compose -.. autoclass:: OneOf -.. autoclass:: RandomApply diff --git a/v0.9.0/_sources/utils.rst.txt b/v0.9.0/_sources/utils.rst.txt deleted file mode 100644 index 69c1abe0eb..0000000000 --- a/v0.9.0/_sources/utils.rst.txt +++ /dev/null @@ -1,36 +0,0 @@ -doctr.utils -=========== - -This module regroups non-core features that are complementary to the rest of the package. - -.. currentmodule:: doctr.utils - - -Visualization -------------- -Easy-to-use functions to make sense of your model's predictions. - -.. currentmodule:: doctr.utils.visualization - -.. autofunction:: visualize_page - - -.. _metrics: - -Task evaluation ---------------- -Implementations of task-specific metrics to easily assess your model performances. - -.. currentmodule:: doctr.utils.metrics - -.. autoclass:: TextMatch - - .. automethod:: summary - -.. 
autoclass:: LocalizationConfusion - - .. automethod:: summary - -.. autoclass:: OCRMetric - - .. automethod:: summary diff --git a/v0.9.0/_static/basic.css b/v0.9.0/_static/basic.css index f316efcb47..7ebbd6d07b 100644 --- a/v0.9.0/_static/basic.css +++ b/v0.9.0/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { diff --git a/v0.9.0/_static/doctools.js b/v0.9.0/_static/doctools.js index 4d67807d17..0398ebb9f0 100644 --- a/v0.9.0/_static/doctools.js +++ b/v0.9.0/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; diff --git a/v0.9.0/_static/language_data.js b/v0.9.0/_static/language_data.js index 367b8ed81b..c7fe6c6faf 100644 --- a/v0.9.0/_static/language_data.js +++ b/v0.9.0/_static/language_data.js @@ -1,13 +1,6 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; diff --git a/v0.9.0/_static/searchtools.js b/v0.9.0/_static/searchtools.js index b08d58c9b9..2c774d17af 100644 --- a/v0.9.0/_static/searchtools.js +++ b/v0.9.0/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. /* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. 
+class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => { const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." ); else - Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( @@ -138,7 +145,7 @@ const _displayNextItem = ( else _finishSearch(resultCount); }; // Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. // Order the results by score (in opposite order of appearance, since the // `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. const _orderResultsByScoreThenName = (a, b) => { @@ -248,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -318,7 +326,7 @@ const Search = { const indexEntries = Search._index.indexentries; // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. const normalResults = []; const nonMainIndexResults = []; @@ -337,6 +345,7 @@ const Search = { null, score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -354,6 +363,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.index, ]; if (isMain) { normalResults.push(result); @@ -475,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -585,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; diff --git a/v0.9.0/changelog.html b/v0.9.0/changelog.html index 48c6f3ae85..b3d57fe04a 100644 --- a/v0.9.0/changelog.html +++ b/v0.9.0/changelog.html @@ -14,7 +14,7 @@ - + Changelog - docTR documentation @@ -432,7 +432,7 @@

v0.1.0 (2021-03-05) - + diff --git a/v0.9.0/community/resources.html b/v0.9.0/community/resources.html index 2564037893..9a1988258c 100644 --- a/v0.9.0/community/resources.html +++ b/v0.9.0/community/resources.html @@ -14,7 +14,7 @@ - + Community resources - docTR documentation @@ -389,7 +389,7 @@

Community resources - + diff --git a/v0.9.0/contributing/code_of_conduct.html b/v0.9.0/contributing/code_of_conduct.html index 007a7a2e13..6c5bee3652 100644 --- a/v0.9.0/contributing/code_of_conduct.html +++ b/v0.9.0/contributing/code_of_conduct.html @@ -14,7 +14,7 @@ - + Contributor Covenant Code of Conduct - docTR documentation @@ -500,7 +500,7 @@

Attribution - + diff --git a/v0.9.0/contributing/contributing.html b/v0.9.0/contributing/contributing.html index 5e4ebc62fc..496f96f0f9 100644 --- a/v0.9.0/contributing/contributing.html +++ b/v0.9.0/contributing/contributing.html @@ -14,7 +14,7 @@ - + Contributing to docTR - docTR documentation @@ -477,7 +477,7 @@

Let’s connect - + diff --git a/v0.9.0/datasets.html b/v0.9.0/datasets.html deleted file mode 100644 index 193e576c57..0000000000 --- a/v0.9.0/datasets.html +++ /dev/null @@ -1,578 +0,0 @@ - - - - - - - - - - - - - doctr.datasets - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.datasets

-

Whether it is for training or for evaluation, having predefined objects to access datasets in your preferred framework -can save you a significant amount of time.

-
-

Available Datasets

-

The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

-
-
-class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
-
- -
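For instance, a custom dataset can reuse this verified-download logic by forwarding its own archive location to the constructor above. This is only a minimal sketch: the URL, file name and target-building logic below are hypothetical placeholders.

>>> from doctr.datasets.datasets import VisionDataset
>>> class MyDataset(VisionDataset):
...     def __init__(self, **kwargs):
...         # hypothetical archive: replace the URL (and ideally pass a file_hash) with your own hosted dataset
...         super().__init__(url="https://example.com/my_dataset.zip",
...                          file_name="my_dataset.zip",
...                          extract_archive=True,
...                          **kwargs)
...         self.data = []  # to be filled with (image name, target) pairs
>>> ds = MyDataset(download=True)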

Here are all datasets that are available through DocTR:

-
-
-class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

-
-
Example::
>>> from doctr.datasets import FUNSD
->>> train_set = FUNSD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

-
-
Example::
>>> from doctr.datasets import SROIE
->>> train_set = SROIE(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.CORD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

CORD dataset from “CORD: A Consolidated Receipt Dataset for Post-OCR Parsing”.

-
-
Example::
>>> from doctr.datasets import CORD
->>> train_set = CORD(train=True, download=True)
->>> img, target = train_set[0]
-
-
-
-
-
-
Parameters:
-
    -
  • train – whether the subset should be the training one

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
-

Implements an OCR dataset

-
-
Parameters:
-
    -
  • img_folder – local path to image folder (all jpg at the root)

  • -
  • label_file – local path to the label file

  • -
  • sample_transforms – composable transformations that will be applied to each image

  • -
  • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

  • -
  • **kwargs – keyword arguments from VisionDataset.

  • -
-
-
-
- -
-
-

Data Loading

-

Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

-
-
-class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]
-

Implements a dataset wrapper for fast data loading

-
-
Example::
>>> from doctr.datasets import FUNSD, DataLoader
->>> train_set = FUNSD(train=True, download=True)
->>> train_loader = DataLoader(train_set, batch_size=32)
->>> train_iter = iter(train_loader)
->>> images, targets = next(train_iter)
-
-
-
-
-
-
Parameters:
-
    -
  • dataset – the dataset

  • -
  • shuffle – whether the samples should be shuffled before being passed to the iterator

  • -
  • batch_size – number of elements in each batch

  • -
  • drop_last – if True, drops the last batch if it isn’t full

  • -
  • workers – number of workers to use for data loading

  • -
-
-
-
- -
-
-

Supported Vocabs

-

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets -of vocabs.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DocTR Vocabs

Name

size

characters

digits

10

0123456789

ascii_letters

52

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

punctuation

32

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

currency

5

£€¥¢฿

latin

96

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

french

154

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

-
-
-
-doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]
-

Encode character sequences using a given vocab as mapping

-
-
Parameters:
-
    -
  • sequences – the list of character sequences of size N

  • -
  • vocab – the ordered vocab to use for encoding

  • -
  • target_size – maximum length of the encoded data

  • -
  • eos – encoding of End Of String

  • -
  • sos – optional encoding of Start Of String

  • -
  • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

  • -
-
-
Returns:
-

the padded encoded data as a tensor

-
-
-
- -
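For instance, using the digits vocab from the table above, a call could look like the following minimal sketch (the EOS and PAD values are simply chosen outside the vocab index range):

>>> from doctr.datasets import encode_sequences
>>> vocab = "0123456789"
>>> # each sequence is mapped to vocab indices, then padded up to target_size
>>> encoded = encode_sequences(["123", "45"], vocab=vocab, target_size=5, eos=len(vocab), pad=len(vocab) + 1)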
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/documents.html b/v0.9.0/documents.html deleted file mode 100644 index 98cbb2c5ef..0000000000 --- a/v0.9.0/documents.html +++ /dev/null @@ -1,772 +0,0 @@ - - - - - - - - - - - - - doctr.documents - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.documents

-

The documents module enables users to easily access content from documents and export analysis -results to structured formats.

-
-

Document structure

-

Structural organization of the documents.

-
-

Word

-

A Word is an uninterrupted sequence of characters.

-
-
-class doctr.documents.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
-

Implements a word element

-
-
Parameters:
-
    -
  • value – the text string of the word

  • -
  • confidence – the confidence associated with the text prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page’s size

  • -
-
-
-
- -
-
-

Line

-

A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, words at the same horizontal level but in different columns are considered to form two distinct Lines).

-
-
-class doctr.documents.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a line element as a collection of words

-
-
Parameters:
-
    -
  • words – list of word elements

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

  • -
-
-
-
- -
-
-

Artefact

-

An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

-
-
-class doctr.documents.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
-

Implements a non-textual element

-
-
Parameters:
-
    -
  • artefact_type – the type of artefact

  • -
  • confidence – the confidence of the type prediction

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

  • -
-
-
-
- -
-
-

Block

-

A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

-
-
-class doctr.documents.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
-

Implements a block element as a collection of lines and artefacts

-
-
Parameters:
-
    -
  • lines – list of line elements

  • -
  • artefacts – list of artefacts

  • -
  • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

  • -
-
-
-
- -
-
-

Page

-

A Page is a collection of Blocks that were on the same physical page.

-
-
-class doctr.documents.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
-

Implements a page element as a collection of blocks

-
-
Parameters:
-
    -
  • blocks – list of block elements

  • -
  • page_idx – the index of the page in the input raw document

  • -
  • dimensions – the page size in pixels in format (width, height)

  • -
  • orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

  • -
  • language – a dictionary with the language value and confidence of the prediction

  • -
-
-
-
-
-show(page: ndarray, interactive: bool = True, **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-
    -
  • page – image encoded as a numpy array in uint8

  • -
  • interactive – whether the display should be interactive

  • -
-
-
-
- -
- -
-
-

Document

-

A Document is a collection of Pages.

-
-
-class doctr.documents.Document(pages: List[Page])[source]
-

Implements a document element as a collection of pages

-
-
Parameters:
-

pages – list of page elements

-
-
-
-
-show(pages: List[ndarray], **kwargs) None[source]
-

Overlay the result on a given image

-
-
Parameters:
-

pages – list of images encoded as numpy arrays in uint8

-
-
-
- -
- -
-
-
-
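To make this hierarchy concrete, here is a minimal hand-built document (a sketch using the constructors documented above; the geometry and dimensions are purely illustrative):

>>> from doctr.documents import Word, Line, Block, Page, Document
>>> word = Word("Hello", confidence=0.99, geometry=((0.1, 0.1), (0.3, 0.15)))
>>> line = Line(words=[word])    # geometry defaults to the smallest box enclosing its words
>>> block = Block(lines=[line])  # likewise for its lines and artefacts
>>> page = Page(blocks=[block], page_idx=0, dimensions=(595, 842))  # (width, height) in pixels
>>> doc = Document(pages=[page])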

File reading

-

High-performance file reading and conversion to processable structured data.

-
-
-doctr.documents.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
-

Read a PDF file and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import read_pdf
->>> doc = read_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_img(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
-

Read an image file into numpy format

-
-
Example::
>>> from doctr.documents import read_img
->>> page = read_img("path/to/your/doc.jpg")
-
-
-
-
-
-
Parameters:
-
    -
  • file – the path to the image file

  • -
  • output_size – the expected output size of each page in format H x W

  • -
  • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

  • -
-
-
Returns:
-

the page decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-doctr.documents.read_html(url: str, **kwargs: Any) bytes[source]
-

Read a web page and convert it into a PDF file as a bytes stream

-
-
Example::
>>> from doctr.documents import read_html
->>> doc = read_html("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – URL of the target web page

-
-
Returns:
-

decoded PDF file as a bytes stream

-
-
-
- -
-
-class doctr.documents.DocumentFile[source]
-

Read a document from multiple extensions

-
-
-classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
-

Read a PDF file

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-
-
-
-
-
-
Parameters:
-

file – the path to the PDF file or a binary stream

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_url(url: str, **kwargs) PDF[source]
-

Interpret a web page as a PDF document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> doc = DocumentFile.from_url("https://www.yoursite.com")
-
-
-
-
-
-
Parameters:
-

url – the URL of the target web page

-
-
Returns:
-

a PDF document

-
-
-
- -
-
-classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
-

Read an image file (or a collection of image files) and convert it into an image in numpy format

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
-
-
-
-
-
-
Parameters:
-

files – the path to the image file or a binary stream, or a collection of those

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
- -
-
-class doctr.documents.PDF(doc: Document)[source]
-

PDF document template

-
-
Parameters:
-

doc – input PDF document

-
-
-
-
-as_images(**kwargs) List[ndarray][source]
-

Convert all document pages to images

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of convert_page_to_numpy

-
-
Returns:
-

the list of pages decoded as numpy ndarray of shape H x W x 3

-
-
-
- -
-
-get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
-

Get the annotations for all words in the document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
-
-
-
-
-
-
Parameters:
-

kwargs – keyword arguments of fitz.Page.getTextWords

-
-
Returns:
-

the list of pages annotations, represented as a list of tuple (bounding box, value)

-
-
-
- -
-
-get_artefacts() List[List[Tuple[float, float, float, float]]][source]
-

Get the artefacts for the entire document

-
-
Example::
>>> from doctr.documents import DocumentFile
->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
-
-
-
-
-
-
Returns:
-

the list of pages artefacts, represented as a list of bounding boxes

-
-
-
- -
- -
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/genindex.html b/v0.9.0/genindex.html index dfd304f0ff..34b60498eb 100644 --- a/v0.9.0/genindex.html +++ b/v0.9.0/genindex.html @@ -13,7 +13,7 @@ - Index - docTR documentation + Index - docTR documentation @@ -752,7 +752,7 @@

W

- + diff --git a/v0.9.0/getting_started/installing.html b/v0.9.0/getting_started/installing.html index 0ff133aeec..7645f38a83 100644 --- a/v0.9.0/getting_started/installing.html +++ b/v0.9.0/getting_started/installing.html @@ -14,7 +14,7 @@ - + Installation - docTR documentation @@ -431,7 +431,7 @@

Via Git - + diff --git a/v0.9.0/index.html b/v0.9.0/index.html index e60d5efd26..79b35e0b90 100644 --- a/v0.9.0/index.html +++ b/v0.9.0/index.html @@ -14,7 +14,7 @@ - + docTR documentation @@ -439,7 +439,7 @@

Supported datasets - + diff --git a/v0.9.0/installing.html b/v0.9.0/installing.html deleted file mode 100644 index b61c60134b..0000000000 --- a/v0.9.0/installing.html +++ /dev/null @@ -1,395 +0,0 @@ - - - - - - - - - - - - - Installation - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Installation

-

This library requires Python 3.6 or higher.

-
-

Prerequisites

-

Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

- -

If you are running an OS other than Linux, you will need a few extra dependencies.

-

For macOS users, you can install them as follows:

-
brew install cairo pango gdk-pixbuf libffi
-
-
-

For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

-
-
-

Via Python Package

-

Install the latest stable release of the package using pip:

-
pip install python-doctr
-
-
-
-
-

Via Git

-

Install the library in developer mode:

-
git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/models.html b/v0.9.0/models.html deleted file mode 100644 index b5cd44c9fa..0000000000 --- a/v0.9.0/models.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - doctr.models - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

doctr.models

-

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

-

For a given task, DocTR provides a Predictor, which is composed of 2 components:

-
    -
  • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

  • -
  • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.

  • -
-
-

Text Detection

-

Localizing text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Architecture    Input shape        # params    FUNSD Recall    FUNSD Precision    CORD Recall    CORD Precision    FPS
db_resnet50     (1024, 1024, 3)    25.2 M      82.14           87.64              92.49          89.66             2.1

-
-

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model’s capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model and feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up, then measure its average speed on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

-
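For reference, a rough sketch of that measurement is given below; it assumes a detection model instantiated as in the examples that follow, and absolute timings will of course vary with hardware.

>>> import time
>>> import tensorflow as tf
>>> for _ in range(100):  # warm-up
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32))
>>> start = time.time()
>>> for _ in range(1000):  # 1000 batches of 1 frame
...     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32))
>>> fps = 1000 / (time.time() - start)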
-

Pre-processing for detection

-

In DocTR, the pre-processing scheme for detection is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

  2. -
  3. batch images together

  4. -
  5. normalize the batch using the training data statistics

  6. -
-
-
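The three steps above roughly correspond to the following tensor operations (an illustrative sketch only; the normalization statistics shown are placeholders, not the actual training values used by the pretrained models):

>>> import tensorflow as tf
>>> imgs = [tf.random.uniform(shape=[768, 512, 3]), tf.random.uniform(shape=[640, 640, 3])]
>>> resized = [tf.image.resize(img, [1024, 1024], method="bilinear") for img in imgs]  # 1. resize (may deform)
>>> batch = tf.stack(resized, axis=0)                                                  # 2. batch
>>> mean, std = tf.constant([0.5, 0.5, 0.5]), tf.constant([0.5, 0.5, 0.5])             # placeholder statistics
>>> batch = (batch - mean) / std                                                       # 3. normalize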
-

Detection models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
-

DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a ResNet-50 backbone.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
-

LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import linknet16
->>> model = linknet16(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

-
-
Returns:
-

text detection architecture

-
-
-
- -
-
-

Detection predictors

-

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and return structured information.

-
-
-doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
-

Text detection architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import detection_predictor
->>> model = detection_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_resnet50’)

  • -
  • pretrained – If True, returns a model pre-trained on our text detection dataset

  • -
-
-
Returns:
-

Detection predictor

-
-
-
- -
-
-
-

Text Recognition

-

Identifying strings in images

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Text recognition model zoo

Architecture     Input shape     # params    FUNSD    CORD    FPS
crnn_vgg16_bn    (32, 128, 3)    15.8M       86.02    91.3    12.8
sar_vgg16_bn     (32, 128, 3)    21.5M       86.2     91.7    3.3
sar_resnet31     (32, 128, 3)    53.1M       86.3     92.1    2.7

-
-

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All these recognition models are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model’s capabilities

-

FPS (Frames per second) is computed this way: we instantiate the model and feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up, then measure its average speed on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

-
-

Pre-processing for recognition

-

In DocTR, the pre-processing scheme for recognition is the following:

-
    -
  1. resize each input image to the target size (bilinear interpolation by default) without deformation.

  2. -
  3. pad the image to the target size (with zeros by default)

  4. -
  5. batch images together

  6. -
  7. normalize the batch using the training data statistics

  8. -
-
-
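An illustrative sketch of these four steps is given below; resize_with_pad preserves the aspect ratio and zero-pads to the target size, and the normalization statistics are again placeholders.

>>> import tensorflow as tf
>>> crops = [tf.random.uniform(shape=[24, 100, 3]), tf.random.uniform(shape=[40, 80, 3])]
>>> padded = [tf.image.resize_with_pad(c, 32, 128) for c in crops]  # steps 1-2: resize without deformation, then pad
>>> batch = tf.stack(padded, axis=0)                                # step 3: batch
>>> batch = (batch - 0.5) / 0.5                                     # step 4: normalize (placeholder statistics)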
-

Recognition models

-

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

-
-
-doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
-

CRNN with a VGG-16 backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import crnn_vgg16_bn
->>> model = crnn_vgg16_bn(pretrained=True)
->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong -Baseline for Irregular Text Recognition”.

-
-
Example::
>>> import tensorflow as tf
->>> from doctr.models import sar_vgg16_bn
->>> model = sar_vgg16_bn(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.sar_resnet31(pretrained: bool = False, **kwargs: Any) SAR[source]
-

SAR with a resnet-31 feature extractor as described in “Show, Attend and Read: A Simple and Strong -Baseline for Irregular Text Recognition”.

-

Example

-
>>> import tensorflow as tf
->>> from doctr.models import sar_resnet31
->>> model = sar_resnet31(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
-

MASTER as described in the paper: https://arxiv.org/pdf/1910.02562.pdf. -Example:

-
>>> import tensorflow as tf
->>> from doctr.models import master
->>> model = master(pretrained=False)
->>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
->>> out = model(input_tensor)
-
-
-
-
Parameters:
-

pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

-
-
Returns:
-

text recognition architecture

-
-
-
- -
-
-

Recognition predictors

-

Combining the right components around a given architecture for easier usage.

-
-
-doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
-

Text recognition architecture.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import recognition_predictor
->>> model = recognition_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

  • -
  • pretrained – If True, returns a model pre-trained on our text recognition dataset

  • -
-
-
Returns:
-

Recognition predictor

-
-
-
- -
-
-
-

End-to-End OCR

-

Predictors that localize and identify text elements in images

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Architecture                   FUNSD Recall    FUNSD Precision    FUNSD FPS    CORD Recall    CORD Precision    CORD FPS
db_resnet50 + crnn_vgg16_bn    70.08           74.77              0.85         82.19          79.67             1.6
db_resnet50 + sar_vgg16_bn     N/A             N/A                0.49         N/A            N/A               1.0
db_resnet50 + sar_resnet31     N/A             N/A                0.27         N/A            N/A               0.83
Gvision text detection         59.50           62.50                           75.30          70.00
Gvision doc. text detection    64.00           53.30                           68.90          61.10
AWS textract                   78.10           83.00                           87.50          66.00

-
-

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

-

All recognition models of predictors are trained with our french vocab (cf. Supported Vocabs).

-

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model’s capabilities

-

FPS (Frames per second) is computed this way: we instantiate the predictor, warm up the model, and then measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. -We used a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

-

Results on private ocr datasets

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Architecture                          Receipts Recall    Receipts Precision    Invoices Recall    Invoices Precision    IDs Recall    IDs Precision
db_resnet50 + crnn_vgg16_bn (ours)    78.90              81.01                 65.68              69.86                 49.48         50.46
Gvision doc. text detection           68.91              59.89                 63.20              52.85                 43.70         29.21
AWS textract                          75.77              77.70                 70.47              69.13                 46.39         43.32

-
-
-

Two-stage approaches

-

Those architectures involve one stage of text detection and one stage of text recognition. The text detection output is used to produce cropped images that are then passed to the text recognition block.

-
-
-doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]
-

End-to-end OCR architecture using one model for localization, and another for text recognition.

-
-
Example::
>>> import numpy as np
->>> from doctr.models import ocr_predictor
->>> model = ocr_predictor(pretrained=True)
->>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
->>> out = model([input_page])
-
-
-
-
-
-
Parameters:
-
    -
  • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

  • -
  • pretrained – If True, returns a model pre-trained on our OCR dataset

  • -
-
-
Returns:
-

OCR predictor

-
-
-
- -
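The predictor returns a Document (cf. the documents module above), so its content can be traversed or rendered back onto the input pages. A short sketch continuing the example above, where the nested attributes mirror the constructors of Page, Block, Line and Word:

>>> for page in out.pages:
...     for block in page.blocks:
...         for line in block.lines:
...             print(" ".join(word.value for word in line.words))
>>> out.show([input_page])  # overlay the predictions on the input image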
-
-
-

Model export

-

Utility functions to make the most of document analysis models.

-
-

Model compression

-
-
-doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
-

Converts a model to TFLite format

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_tflite, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_tflite(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
-

Converts a model to half precision

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import convert_to_fp16, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = convert_to_fp16(model)
-
-
-
-
-
-
Parameters:
-

tf_model – a keras model

-
-
Returns:
-

the serialized FP16 model

-
-
Return type:
-

bytes

-
-
-
- -
-
-doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
-

Quantize a TensorFlow model

-
-
Example::
>>> from tensorflow.keras import Sequential
->>> from doctr.models import quantize_model, conv_sequence
->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> serialized_model = quantize_model(model, (224, 224, 3))
-
-
-
-
-
-
Parameters:
-
    -
  • tf_model – a keras model

  • -
  • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

  • -
-
-
Returns:
-

the serialized quantized model

-
-
Return type:
-

bytes

-
-
-
- -
-
-

Using SavedModel

-

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

-
>>> import tensorflow as tf
->>> from doctr.models import db_resnet50
->>> model = db_resnet50(pretrained=True)
->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
->>> _ = model(input_t, training=False)
->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
-
-
-

And loaded just as easily:

-
>>> import tensorflow as tf
->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
-
-
-
-
-
- -
-
- -
- -
-
- - - - - - - - \ No newline at end of file diff --git a/v0.9.0/modules/contrib.html b/v0.9.0/modules/contrib.html index 08a6d7be12..8c5490014b 100644 --- a/v0.9.0/modules/contrib.html +++ b/v0.9.0/modules/contrib.html @@ -14,7 +14,7 @@ - + doctr.contrib - docTR documentation @@ -376,7 +376,7 @@

Supported contribution modules - + diff --git a/v0.9.0/modules/datasets.html b/v0.9.0/modules/datasets.html index 4bef8a382c..fd825b1a49 100644 --- a/v0.9.0/modules/datasets.html +++ b/v0.9.0/modules/datasets.html @@ -14,7 +14,7 @@ - + doctr.datasets - docTR documentation @@ -1058,7 +1058,7 @@

Returns: - + diff --git a/v0.9.0/modules/io.html b/v0.9.0/modules/io.html index b809300a22..ae9d1b8aff 100644 --- a/v0.9.0/modules/io.html +++ b/v0.9.0/modules/io.html @@ -14,7 +14,7 @@ - + doctr.io - docTR documentation @@ -756,7 +756,7 @@

Returns: - + diff --git a/v0.9.0/modules/models.html b/v0.9.0/modules/models.html index bb6930cf22..c1aaa3ad8a 100644 --- a/v0.9.0/modules/models.html +++ b/v0.9.0/modules/models.html @@ -14,7 +14,7 @@ - + doctr.models - docTR documentation @@ -1598,7 +1598,7 @@

Args: - + diff --git a/v0.9.0/modules/transforms.html b/v0.9.0/modules/transforms.html index 9be1b73323..ada5ec8d1e 100644 --- a/v0.9.0/modules/transforms.html +++ b/v0.9.0/modules/transforms.html @@ -14,7 +14,7 @@ - + doctr.transforms - docTR documentation @@ -831,7 +831,7 @@

Args:< - + diff --git a/v0.9.0/modules/utils.html b/v0.9.0/modules/utils.html index 65f59737ce..8d375d93c5 100644 --- a/v0.9.0/modules/utils.html +++ b/v0.9.0/modules/utils.html @@ -14,7 +14,7 @@ - + doctr.utils - docTR documentation @@ -711,7 +711,7 @@

Args: - + diff --git a/v0.9.0/notebooks.html b/v0.9.0/notebooks.html index 9c65c97b9d..dae441b209 100644 --- a/v0.9.0/notebooks.html +++ b/v0.9.0/notebooks.html @@ -14,7 +14,7 @@ - + docTR Notebooks - docTR documentation @@ -381,7 +381,7 @@

docTR Notebooks - + diff --git a/v0.9.0/py-modindex.html b/v0.9.0/py-modindex.html deleted file mode 100644 index c1569be607..0000000000 --- a/v0.9.0/py-modindex.html +++ /dev/null @@ -1,330 +0,0 @@ - - - - - - - - - - - Python Module Index - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

Python Module Index

- -
- - - - - - - - - - - -
 
d
- doctr -
- -
-
-
- - - - - - - - - \ No newline at end of file diff --git a/v0.9.0/search.html b/v0.9.0/search.html index c6b16a4f1a..8f039172f3 100644 --- a/v0.9.0/search.html +++ b/v0.9.0/search.html @@ -14,7 +14,7 @@ - + Search - docTR documentation @@ -336,7 +336,7 @@ - + diff --git a/v0.9.0/searchindex.js b/v0.9.0/searchindex.js index 94c6d7bf4e..9731d4328a 100644 --- a/v0.9.0/searchindex.js +++ b/v0.9.0/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[13, null]], "Advanced options": [[18, "advanced-options"]], "Args:": [[6, "args"], [6, "id4"], [6, "id7"], [6, "id10"], [6, "id13"], [6, "id16"], [6, "id19"], [6, "id22"], [6, "id25"], [6, "id29"], [6, "id32"], [6, "id37"], [6, "id40"], [6, "id46"], [6, "id49"], [6, "id50"], [6, "id51"], [6, "id54"], [6, "id57"], [6, "id60"], [6, "id61"], [7, "args"], [7, "id2"], [7, "id3"], [7, "id4"], [7, "id5"], [7, "id6"], [7, "id7"], [7, "id10"], [7, "id12"], [7, "id14"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id28"], [8, "args"], [8, "id3"], [8, "id8"], [8, "id13"], [8, "id17"], [8, "id21"], [8, "id26"], [8, "id31"], [8, "id36"], [8, "id41"], [8, "id46"], [8, "id50"], [8, "id54"], [8, "id59"], [8, "id63"], [8, "id68"], [8, "id73"], [8, "id77"], [8, "id81"], [8, "id85"], [8, "id90"], [8, "id95"], [8, "id99"], [8, "id104"], [8, "id109"], [8, "id114"], [8, "id119"], [8, "id123"], [8, "id127"], [8, "id132"], [8, "id137"], [8, "id142"], [8, "id146"], [8, "id150"], [8, "id155"], [8, "id159"], [8, "id163"], [8, "id167"], [8, "id169"], [8, "id171"], [8, "id173"], [9, "args"], [9, "id1"], [9, "id2"], [9, "id3"], [9, "id4"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"], [9, "id12"], [9, "id13"], [9, "id14"], [9, "id15"], [9, "id16"], [9, "id17"], [9, "id18"], [9, "id19"], [10, "args"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"]], "Artefact": [[7, "artefact"]], "ArtefactDetection": [[15, "artefactdetection"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[16, "available-datasets"]], "Available architectures": [[18, "available-architectures"], [18, "id1"], [18, "id2"]], "Available contribution modules": [[15, "available-contribution-modules"]], "Block": [[7, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[16, null]], "Choosing the right model": [[18, null]], "Classification": [[14, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[9, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[6, "custom-dataset-loader"]], "Data Loading": [[16, "data-loading"]], "Dataloader": [[6, "dataloader"]], "Detection": [[14, "detection"], [16, "detection"]], "Detection predictors": [[18, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[7, "document"]], "Document structure": [[7, "document-structure"]], "End-to-End OCR": [[18, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], 
"Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[17, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[7, "file-reading"]], "Half-precision": [[17, "half-precision"]], "Installation": [[3, null]], "Integrate contributions into your pipeline": [[15, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[7, "line"]], "Loading from Huggingface Hub": [[14, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[12, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[17, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[14, "naming-conventions"]], "Object Detection": [[16, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[7, "page"]], "Preparing your model for inference": [[17, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[14, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[14, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[14, "recognition"], [16, "recognition"]], "Recognition predictors": [[18, "recognition-predictors"]], "Returns:": [[6, "returns"], [7, "returns"], [7, "id11"], [7, "id13"], [7, "id15"], [7, "id19"], [7, "id23"], [7, "id27"], [7, "id31"], [8, "returns"], [8, "id6"], [8, "id11"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id29"], [8, "id34"], [8, "id39"], [8, "id44"], [8, "id49"], [8, "id53"], [8, "id57"], [8, "id62"], [8, "id66"], [8, "id71"], [8, "id76"], [8, "id80"], [8, "id84"], [8, "id88"], [8, "id93"], [8, "id98"], [8, "id102"], [8, "id107"], [8, "id112"], [8, "id117"], [8, "id122"], [8, "id126"], [8, "id130"], [8, "id135"], [8, "id140"], [8, "id145"], [8, "id149"], [8, "id153"], [8, "id158"], [8, "id162"], [8, "id166"], [8, "id168"], [8, "id170"], [8, "id172"], [10, "returns"]], "Scope": [[1, "scope"]], "Share your model with the community": [[14, null]], "Supported Vocabs": [[6, "supported-vocabs"]], "Supported contribution modules": [[5, "supported-contribution-modules"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[9, "supported-transformations"]], "Synthetic dataset generator": [[6, "synthetic-dataset-generator"], [16, "synthetic-dataset-generator"]], "Task evaluation": [[10, "task-evaluation"]], "Text Detection": [[18, "text-detection"]], "Text Recognition": [[18, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[12, null]], "Two-stage approaches": [[18, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[16, "use-your-own-datasets"]], "Using your ONNX exported model": [[17, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[3, "via-conda-only-for-linux"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[10, "visualization"]], "What should I do with the output?": [[18, "what-should-i-do-with-the-output"]], "Word": [[7, "word"]], "docTR Notebooks": [[11, null]], "docTR Vocabs": [[6, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.contrib": [[5, null]], "doctr.datasets": [[6, null], [6, "datasets"]], "doctr.io": [[7, null]], "doctr.models": 
[[8, null]], "doctr.models.classification": [[8, "doctr-models-classification"]], "doctr.models.detection": [[8, "doctr-models-detection"]], "doctr.models.factory": [[8, "doctr-models-factory"]], "doctr.models.recognition": [[8, "doctr-models-recognition"]], "doctr.models.zoo": [[8, "doctr-models-zoo"]], "doctr.transforms": [[9, null]], "doctr.utils": [[10, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[7, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[7, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[9, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[6, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[9, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[9, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[6, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[8, 
"doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[6, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[8, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[8, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[7, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[8, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[6, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[6, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[7, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[7, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[6, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[6, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[9, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[9, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[6, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[6, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[6, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[6, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[6, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[8, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[9, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[7, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[8, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[6, 
"doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[9, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[8, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[6, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[9, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[7, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[8, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[9, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[9, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[9, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[9, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[9, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[9, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[9, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[9, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[9, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[9, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[9, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[9, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[7, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[7, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[7, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[7, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[6, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[9, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[8, 
"doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[7, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[7, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[6, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[10, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[10, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[10, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[10, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[6, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[6, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[6, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[9, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[10, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[10, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[10, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[10, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[10, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[8, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[8, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[6, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[7, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[6, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[6, 0, 1, "", "CORD"], [6, 0, 1, "", "CharacterGenerator"], [6, 0, 1, "", "DetectionDataset"], [6, 0, 1, "", "DocArtefacts"], [6, 0, 1, "", "FUNSD"], [6, 0, 1, "", "IC03"], [6, 0, 1, "", "IC13"], [6, 0, 1, "", "IIIT5K"], [6, 0, 1, "", "IIITHWS"], [6, 0, 1, "", 
"IMGUR5K"], [6, 0, 1, "", "MJSynth"], [6, 0, 1, "", "OCRDataset"], [6, 0, 1, "", "RecognitionDataset"], [6, 0, 1, "", "SROIE"], [6, 0, 1, "", "SVHN"], [6, 0, 1, "", "SVT"], [6, 0, 1, "", "SynthText"], [6, 0, 1, "", "WILDRECEIPT"], [6, 0, 1, "", "WordGenerator"], [6, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[6, 0, 1, "", "DataLoader"]], "doctr.io": [[7, 0, 1, "", "Artefact"], [7, 0, 1, "", "Block"], [7, 0, 1, "", "Document"], [7, 0, 1, "", "DocumentFile"], [7, 0, 1, "", "Line"], [7, 0, 1, "", "Page"], [7, 0, 1, "", "Word"], [7, 1, 1, "", "decode_img_as_tensor"], [7, 1, 1, "", "read_html"], [7, 1, 1, "", "read_img_as_numpy"], [7, 1, 1, "", "read_img_as_tensor"], [7, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[7, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[7, 2, 1, "", "from_images"], [7, 2, 1, "", "from_pdf"], [7, 2, 1, "", "from_url"]], "doctr.io.Page": [[7, 2, 1, "", "show"]], "doctr.models": [[8, 1, 1, "", "kie_predictor"], [8, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[8, 1, 1, "", "crop_orientation_predictor"], [8, 1, 1, "", "magc_resnet31"], [8, 1, 1, "", "mobilenet_v3_large"], [8, 1, 1, "", "mobilenet_v3_large_r"], [8, 1, 1, "", "mobilenet_v3_small"], [8, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [8, 1, 1, "", "mobilenet_v3_small_page_orientation"], [8, 1, 1, "", "mobilenet_v3_small_r"], [8, 1, 1, "", "page_orientation_predictor"], [8, 1, 1, "", "resnet18"], [8, 1, 1, "", "resnet31"], [8, 1, 1, "", "resnet34"], [8, 1, 1, "", "resnet50"], [8, 1, 1, "", "textnet_base"], [8, 1, 1, "", "textnet_small"], [8, 1, 1, "", "textnet_tiny"], [8, 1, 1, "", "vgg16_bn_r"], [8, 1, 1, "", "vit_b"], [8, 1, 1, "", "vit_s"]], "doctr.models.detection": [[8, 1, 1, "", "db_mobilenet_v3_large"], [8, 1, 1, "", "db_resnet50"], [8, 1, 1, "", "detection_predictor"], [8, 1, 1, "", "fast_base"], [8, 1, 1, "", "fast_small"], [8, 1, 1, "", "fast_tiny"], [8, 1, 1, "", "linknet_resnet18"], [8, 1, 1, "", "linknet_resnet34"], [8, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[8, 1, 1, "", "from_hub"], [8, 1, 1, "", "login_to_hub"], [8, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[8, 1, 1, "", "crnn_mobilenet_v3_large"], [8, 1, 1, "", "crnn_mobilenet_v3_small"], [8, 1, 1, "", "crnn_vgg16_bn"], [8, 1, 1, "", "master"], [8, 1, 1, "", "parseq"], [8, 1, 1, "", "recognition_predictor"], [8, 1, 1, "", "sar_resnet31"], [8, 1, 1, "", "vitstr_base"], [8, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[9, 0, 1, "", "ChannelShuffle"], [9, 0, 1, "", "ColorInversion"], [9, 0, 1, "", "Compose"], [9, 0, 1, "", "GaussianBlur"], [9, 0, 1, "", "GaussianNoise"], [9, 0, 1, "", "LambdaTransformation"], [9, 0, 1, "", "Normalize"], [9, 0, 1, "", "OneOf"], [9, 0, 1, "", "RandomApply"], [9, 0, 1, "", "RandomBrightness"], [9, 0, 1, "", "RandomContrast"], [9, 0, 1, "", "RandomCrop"], [9, 0, 1, "", "RandomGamma"], [9, 0, 1, "", "RandomHorizontalFlip"], [9, 0, 1, "", "RandomHue"], [9, 0, 1, "", "RandomJpegQuality"], [9, 0, 1, "", "RandomResize"], [9, 0, 1, "", "RandomRotate"], [9, 0, 1, "", "RandomSaturation"], [9, 0, 1, "", "RandomShadow"], [9, 0, 1, "", "Resize"], [9, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[10, 0, 1, "", "DetectionMetric"], [10, 0, 1, "", "LocalizationConfusion"], [10, 0, 1, "", "OCRMetric"], [10, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": 
[[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.visualization": [[10, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 7, 8, 10, 14, 17], "0": [1, 3, 6, 9, 10, 12, 15, 16, 18], "00": 18, "01": 18, "0123456789": 6, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 6, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 6, "02": [], "02562": 8, "03": 18, "035": 18, "0361328125": 18, "04": 18, "05": 18, "06": 18, "06640625": 18, "07": 18, "08": [9, 18], "09": 18, "0966796875": 18, "1": [3, 6, 7, 8, 9, 10, 12, 16, 18], "10": [6, 10, 18], "100": [6, 9, 10, 16, 18], "1000": 18, "101": 6, "1024": [8, 12, 18], "104": 6, "106": 6, "108": 6, "1095": 16, "11": 18, "110": 10, "1107": 16, "114": 6, "115": [], "1156": 16, "116": 6, "118": 6, "11800h": 18, "11th": 18, "12": [3, 18], "120": 6, "123": 6, "126": 6, "1268": 16, "128": [8, 12, 17, 18], "13": 18, "130": 6, "13068": 16, "131": 6, "1337891": 16, "1357421875": 18, "1396484375": 18, "14": 18, "1420": 18, "14470v1": 6, "149": 16, "15": 18, "150": [10, 18], "154": [], "1552": 18, "16": [8, 17, 18], "160": [], "1630859375": 18, "1684": 18, "16x16": 8, "17": 18, "1778": 18, "1782": 18, "18": [8, 18], "185546875": 18, "19": [], "1900": 18, "1910": 8, "19342": 16, "19370": 16, "195": 6, "19598": 16, "199": 18, "1999": 18, "1m": [], "2": [3, 4, 6, 7, 9, 15, 18], "20": 18, "200": 10, "2000": 16, "2003": [4, 6], "2012": 6, "2013": [4, 6], "2015": 6, "2019": 4, "2021": [], "2023": [], "207901": 16, "21": 18, "2103": 6, "2186": 16, "21888": 16, "22": 18, "224": [8, 9], "225": 9, "22672": 16, "229": [9, 16], "23": 18, "233": 16, "234": 6, "236": [], "24": 18, "246": 16, "249": 16, "25": 18, "2504": 18, "255": [7, 8, 9, 10, 18], "256": 8, "257": 16, "26": 18, "26032": 16, "264": 12, "27": 18, "2700": 16, "2710": 18, "2749": 12, "28": 18, "287": 12, "29": 18, "296": 12, "299": 12, "2d": 18, "3": [3, 4, 7, 8, 9, 10, 17, 18], "30": 18, "300": 16, "3000": 16, "301": 12, "30595": 18, "30ghz": 18, "31": 8, "32": [6, 8, 9, 12, 16, 17, 18], "3232421875": 18, "33": [9, 18], "33402": 16, "33608": 16, "34": [8, 18], "340": 18, "3456": 18, "35": [], "3515625": 18, "36": 18, "360": 16, "37": [6, 18], "38": 18, "39": 18, "4": [8, 9, 10, 18], "40": 18, "406": 9, "41": 18, "42": 18, "43": 18, "44": 18, "45": 18, "456": 9, "46": 18, "47": 18, "472": 16, "48": [6, 18], "485": 9, "49": 18, "49377": 16, "5": [6, 9, 10, 15, 18], "50": [8, 16, 18], "51": 18, "51171875": 18, "512": 8, "52": [6, 18], "529": 18, "53": 18, "533": [], "54": 18, "540": 18, "5478515625": 18, "55": 18, "56": 18, "57": 18, "58": 18, "580": 18, "5810546875": 18, "583": 18, "59": 18, "595": [], "597": 18, "5k": [4, 6], "5m": 18, "6": [9, 18], "60": 9, "600": [8, 10, 18], "61": 18, "611": [], "62": 18, "625": [], "626": 16, "629": [], "63": 18, "630": [], "64": [8, 9, 18], "640": [], "641": 18, "647": 16, "65": 18, "66": 18, "660": [], "664": [], "666": [], "67": 18, "672": [], "68": 18, "689": [], "69": 18, "693": 12, "694": 12, "695": 12, "6m": 18, "7": 18, "70": [6, 10, 18], "700": [], "701": [], "702": [], "707470": 16, "71": [6, 18], "7100000": 16, "713": [], "7141797": 16, "7149": 16, "72": 18, "72dpi": 7, "73": 18, "73257": 16, "733": [], "74": 18, "745": [], 
"75": [9, 18], "753": [], "7581382": 16, "76": 18, "77": 18, "772": 12, "772875": 16, "78": 18, "780": [], "781": [], "783": [], "785": 12, "789": [], "79": 18, "793533": 16, "796": 16, "798": 12, "7m": 18, "8": [8, 9, 18], "80": 18, "800": [8, 10, 16, 18], "81": 18, "817": [], "82": 18, "8275l": [], "83": 18, "830": [], "84": 18, "849": 16, "85": 18, "8564453125": 18, "857": 18, "85875": 16, "86": 18, "860": [], "8603515625": 18, "862": [], "863": [], "87": 18, "8707": 16, "875": [], "88": 18, "89": 18, "8m": [], "9": [3, 9, 18], "90": 18, "90k": 6, "90kdict32px": 6, "91": 18, "913": [], "914085328578949": 18, "917": [], "92": 18, "921": [], "93": 18, "94": [6, 18], "95": [10, 18], "9578408598899841": 18, "96": 18, "97": 18, "98": 18, "99": 18, "9949972033500671": 18, "A": [1, 2, 4, 6, 7, 8, 11, 17], "And": [], "As": 2, "Be": 18, "Being": 1, "By": 13, "For": [1, 2, 3, 12, 18], "If": [2, 7, 8, 12, 18], "In": [2, 6, 16], "It": [9, 14, 15, 17], "Its": [4, 8], "No": [1, 18], "Of": 6, "Or": [15, 17], "The": [1, 2, 6, 7, 10, 13, 15, 17, 18], "Then": 8, "To": [2, 3, 13, 14, 15, 17, 18], "_": [1, 6, 8], "__call__": 18, "_build": 2, "_i": 10, "ab": 6, "abc": 17, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 6, "abdef": [6, 16], "abl": [16, 18], "about": [1, 16, 18], "abov": 18, "abstract": [], "abstractdataset": 6, "abus": 1, "accent": [], "accept": 1, "access": [4, 7, 16, 18], "account": [1, 14], "accur": 18, "accuraci": 10, "achiev": 17, "act": 1, "action": 1, "activ": 4, "ad": [2, 8, 9], "adapt": 1, "add": [9, 10, 14, 18], "add_hook": 18, "add_label": 10, "addit": [2, 3, 7, 15], "addition": [2, 18], "address": [1, 7], "adjust": 9, "advanc": 1, "advantag": 17, "advis": 2, "aesthet": [4, 6], "affect": 1, "after": [14, 18], "ag": 1, "again": 8, "aggreg": [10, 16], "aggress": 1, "align": [1, 7, 9], "all": [1, 2, 5, 6, 7, 9, 10, 15, 16, 18], "allow": [1, 17], "along": 18, "alreadi": [2, 17], "also": [1, 8, 14, 15, 16, 18], "alwai": 16, "an": [1, 2, 4, 6, 7, 8, 10, 15, 17, 18], "analysi": [7, 15], "ancient_greek": 6, "andrej": [], "angl": [7, 9], "ani": [1, 6, 7, 8, 9, 10, 17, 18], "annot": 6, "anot": 16, "anoth": [8, 12, 16], "answer": 1, "anyascii": 10, "anyon": 4, "anyth": 15, "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 6, 9], "applic": [4, 8], "appoint": 1, "appreci": 14, "appropri": [1, 2, 18], "ar": [1, 2, 3, 5, 6, 7, 9, 10, 11, 15, 16, 18], "arab": 6, "arabic_diacrit": 6, "arabic_lett": 6, "arabic_punctu": 6, "arbitrarili": [4, 8], "arch": [8, 14], "architectur": [4, 8, 14, 15], "archiv": [], "area": 18, "arg": [], "argument": [6, 7, 8, 10, 18], "around": 1, "arrai": [7, 9, 10], "art": [4, 15], "artefact": [10, 11, 15, 18], "artefact_typ": 7, "articl": [], "artifici": [4, 6], "arxiv": [6, 8], "as_imag": [], "asarrai": 10, "ascii_lett": 6, "aspect": [4, 8, 9, 18], "assess": 10, "assign": 10, "associ": 7, "assum": 8, "assume_straight_pag": [8, 18], "astyp": [8, 10, 18], "attack": 1, "attend": [4, 8], "attent": [1, 8], "autoclass": [], "autom": 4, "automat": 18, "autoregress": [4, 8], "avail": [1, 4, 5, 9], "averag": [9, 18], "avoid": [1, 3], "aw": [4, 18], "awar": 18, "azur": 18, "b": [8, 10, 18], "b_j": 10, "back": 2, "backbon": 8, "backend": 18, "background": 16, "bangla": 6, "bar": 15, "bar_cod": 16, "baranovskij": [], "base": [4, 8, 15], "baselin": [4, 8, 18], "batch": [6, 8, 9, 15, 16, 18], "batch_siz": [6, 12, 15, 16, 17], "bblanchon": 3, "bbox": 18, "becaus": 13, "been": [2, 10, 16, 18], "befor": [6, 8, 9, 18], "begin": 10, "behavior": [1, 18], 
"being": [10, 18], "belong": 18, "benchmark": 18, "best": 1, "beta": [], "better": [11, 18], "between": [9, 10, 18], "bgr": 7, "bilinear": 9, "bin_thresh": 18, "binar": [4, 8, 18], "binari": [7, 17, 18], "bit": 17, "blank": [], "block": [10, 18], "block_1_1": 18, "blue": [], "blur": 9, "bmvc": 6, "bn": 14, "bodi": [1, 18], "bool": [6, 7, 8, 9, 10], "boolean": [8, 18], "both": [4, 6, 9, 16, 18], "bottom": [8, 18], "bound": [6, 7, 8, 9, 10, 15, 18], "box": [6, 7, 8, 9, 10, 15, 16, 18], "box_thresh": 18, "brew": [], "bright": 9, "broadcast": [], "browser": [2, 4], "build": [2, 3, 17], "built": 2, "byte": [7, 18], "c": [3, 7, 10], "c5": [], "c_j": 10, "cach": [2, 6, 13], "cache_sampl": 6, "cairo": [], "call": 17, "callabl": [6, 9], "can": [2, 3, 12, 13, 14, 15, 16, 18], "capabl": [2, 11, 18], "case": [6, 10], "cf": 18, "cfg": 18, "challeng": 6, "challenge2_test_task12_imag": 6, "challenge2_test_task1_gt": 6, "challenge2_training_task12_imag": 6, "challenge2_training_task1_gt": 6, "chang": [13, 18], "changelog": [], "channel": [1, 2, 7, 9], "channel_prior": 3, "channelshuffl": 9, "charact": [4, 6, 7, 10, 16, 18], "charactergener": [6, 16], "characterist": 1, "charg": 18, "charset": 18, "chart": 7, "check": [2, 14, 18], "checkpoint": 8, "chip": 3, "christian": [], "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 6, 7, 9, 10, 18], "class_nam": 12, "classif": 16, "classif_mobilenet_v3_smal": [], "classmethod": 7, "clear": 2, "clone": 3, "close": 2, "co": 14, "code": [4, 7, 15], "codecov": 2, "colab": 11, "collate_fn": 6, "collect": [7, 15], "color": 9, "colorinvers": 9, "column": 7, "com": [1, 3, 7, 8, 14], "combin": 18, "come": [], "command": [2, 15], "comment": 1, "commit": 1, "common": [1, 9, 10, 17], "commun": 1, "compar": 4, "comparison": [10, 18], "competit": 6, "compil": [11, 18], "complaint": 1, "complementari": 10, "complet": 2, "compon": 18, "compos": [6, 18], "comprehens": 18, "comput": [6, 10, 17, 18], "conf_threshold": 15, "confid": [7, 18], "config": [3, 8], "configur": 8, "confus": 10, "consecut": [9, 18], "consequ": 1, "consid": [1, 2, 6, 7, 10, 18], "consist": 18, "consolid": [4, 6], "constant": 9, "construct": 1, "consum": [], "contact": 1, "contain": [5, 6, 16], "content": [6, 7, 18], "context": 8, "contib": 3, "continu": 1, "contrast": 9, "contrast_factor": 9, "contrib": [3, 15], "contribut": 1, "contributor": 2, "conv_sequ": [], "convers": 7, "convert": [7, 9], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 8, "cool": [], "coordin": [7, 18], "cord": [4, 6, 16, 18], "core": [10, 18], "corner": 18, "correct": 9, "correspond": [3, 7, 9, 18], "could": [1, 15], "counterpart": 10, "cover": 2, "coverag": 2, "cpu": [4, 12, 17], "creat": 14, "crnn": [4, 8, 14], "crnn_mobilenet_v3_larg": [8, 14, 18], "crnn_mobilenet_v3_smal": [8, 17, 18], "crnn_resnet31": [], "crnn_vgg16_bn": [8, 12, 14, 18], "crop": [7, 8, 9, 16, 18], "crop_orient": [7, 18], "crop_orientation_predictor": 8, "crop_param": [], "croporientationpredictor": [], "cuda": 17, "currenc": 6, "current": [2, 18], "custom": [14, 15, 17, 18], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": 18, "cvit": 4, "czczup": 8, "czech": 6, "d": [6, 16], "daili": [], "danish": 6, "data": [4, 6, 7, 9, 10, 12, 14], "dataload": 16, "dataset": [8, 12, 18], "dataset_info": 6, "date": [12, 18], "db": 14, "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [8, 14, 18], "db_resnet34": 18, "db_resnet50": [8, 12, 14, 18], "db_resnet50_rot": [], "db_sar_resnet": 
[], "db_sar_vgg": [], "dbnet": [4, 8], "deal": [], "decis": 1, "decod": 7, "decode_img_as_tensor": 7, "dedic": 17, "deem": 1, "deep": [8, 18], "def": 18, "default": [3, 7, 12, 13, 18], "defer": 16, "defin": [10, 17], "deform": [], "degre": [7, 9], "degress": 7, "delet": 2, "delimit": 18, "delta": 9, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4, 18], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": 8, "descript": 11, "design": 9, "desir": 7, "det_arch": [8, 12, 14, 17], "det_b": 18, "det_model": [12, 14, 17], "det_param": 12, "det_predictor": [12, 18], "detail": [12, 18], "detect": [6, 7, 10, 11, 12, 15], "detect_languag": 8, "detect_orient": 8, "detection_predictor": [8, 18], "detection_task": [], "detectiondataset": [6, 16], "detectionmetr": 10, "detectionpredictor": [8, 12], "detector": [4, 8, 15], "deterior": 8, "determin": 1, "dev": [2, 13], "develop": 3, "developp": [], "deviat": 9, "devic": 17, "dict": [7, 10, 18], "dictionari": [7, 10], "differ": 1, "differenti": [4, 8], "digit": [4, 6, 16], "dimens": [7, 10, 18], "dimension": 9, "direct": 6, "directli": [14, 18], "directori": [2, 13], "disabl": [1, 13, 18], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 18, "discuss": 2, "disk": [], "disparag": 1, "displai": [7, 10], "display_artefact": 10, "distanc": [], "distribut": 9, "div": 18, "divers": 1, "divid": 7, "do": [2, 3, 8], "doc": [2, 7, 15, 17, 18], "docartefact": [6, 16], "docstr": 2, "doctr": [3, 12, 13, 14, 15, 16, 17, 18], "doctr_cache_dir": 13, "doctr_multiprocessing_dis": 13, "document": [6, 8, 10, 11, 15, 16, 17, 18], "documentbuild": 18, "documentfil": [7, 14, 15, 17], "doesn": 17, "don": [12, 18], "done": 9, "download": [6, 16], "downsiz": 8, "draw": 9, "draw_proba": [], "drop": 6, "drop_last": 6, "dtype": [7, 8, 9, 10, 17], "dual": [4, 6], "dummi": 14, "dummy_img": 18, "dummy_input": 17, "dure": 1, "dutch": 6, "dynam": [6, 15], "dynamic_seq_length": 6, "e": [1, 2, 3, 7, 8], "each": [4, 6, 7, 8, 9, 10, 16, 18], "eas": 2, "easi": [4, 10, 14, 17], "easier": [], "easili": [7, 10, 12, 14, 16, 18], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 6, 8], "either": [10, 18], "element": [6, 7, 8, 18], "els": [2, 15], "email": 1, "empathi": 1, "en": 18, "enabl": [6, 7], "enclos": 7, "encod": [4, 6, 7, 8, 18], "encode_sequ": 6, "encount": 2, "encrypt": 7, "end": [4, 6, 8, 10], "english": [6, 16], "enough": [2, 18], "ensur": 2, "entir": [], "entri": 6, "environ": [1, 13], "eo": 6, "equiv": 18, "error": [], "estim": 8, "etc": [7, 15], "ethnic": 1, "evalu": [16, 18], "event": 1, "everyon": 1, "everyth": [2, 18], "exact": [10, 18], "exactmatch": [], "exampl": [1, 2, 4, 6, 8, 14, 18], "exchang": 17, "exclud": [], "execut": 18, "exist": 14, "expand": 9, "expect": [7, 9, 10], "experi": 1, "explan": [1, 18], "explicit": 1, "exploit": [4, 8], "export": [7, 8, 10, 11, 15, 18], "export_as_straight_box": [8, 18], "export_as_xml": 18, "export_model_to_onnx": 17, "express": [1, 9], "extens": 7, "extern": [1, 16], "extra": [], "extract": [4, 6], "extract_arch": [], "extractor": 8, "f_": 10, "f_a": 10, "factor": 9, "fair": 1, "fairli": 1, "fals": [6, 7, 8, 9, 10, 12, 18], "famili": [], "faq": 1, "fascan": 14, "fast": [4, 6, 8], "fast_bas": [8, 18], "fast_smal": [8, 18], "fast_tini": [8, 18], "faster": [4, 8, 17], "fasterrcnn_mobilenet_v3_large_fpn": 8, "favorit": 18, "featur": [3, 8, 10, 11, 15], "feed": [], "feedback": 1, "feel": [2, 14], "felix92": 14, "few": [17, 18], "figsiz": 10, "figur": [10, 15], "file": [2, 6], "file_hash": [], "file_nam": [], 
"final": 8, "find": [2, 16], "fine": [], "finnish": 6, "first": [2, 6], "firsthand": 6, "fit": [8, 18], "fitz": [], "flag": 18, "flexibl": [], "flip": 9, "float": [7, 9, 10, 17], "float32": [7, 8, 9, 17], "fn": 9, "focu": 14, "focus": [1, 6], "folder": 6, "follow": [1, 2, 3, 6, 9, 10, 12, 13, 14, 15, 18], "font": 6, "font_famili": 6, "font_siz": [], "foral": 10, "forc": 2, "forg": 3, "form": [4, 6, 18], "format": [7, 10, 12, 16, 17, 18], "forpost": [4, 6], "forum": 2, "found": [], "fp": [], "fp16": 17, "frac": 10, "frame": [], "framework": [3, 14, 16, 18], "free": [1, 2, 14], "french": [6, 12, 14, 18], "friendli": 4, "from": [1, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18], "from_hub": [8, 14], "from_imag": [7, 14, 15, 17], "from_pdf": 7, "from_url": 7, "full": [6, 10, 18], "fulli": [], "function": [6, 9, 10, 15], "funsd": [4, 6, 16, 18], "further": 16, "futur": 6, "g": [7, 8], "g_": 10, "g_x": 10, "gallagh": [], "gamma": 9, "gaussian": 9, "gaussianblur": 9, "gaussiannois": 9, "gdk": [], "gen": 18, "gender": 1, "gener": [2, 4, 7, 8], "generic_cyrillic_lett": [], "geometri": [4, 7, 18], "geq": 10, "german": [6, 12, 14], "get": [17, 18], "get_artefact": [], "get_word": [], "gettextword": [], "git": 14, "github": [2, 3, 8, 14], "give": [1, 15], "given": [6, 7, 9, 10, 18], "global": 8, "go": 18, "good": 17, "googl": 2, "googlevis": 4, "gpu": [4, 15, 17], "gracefulli": 1, "graph": [4, 6, 7], "grayscal": 9, "ground": 10, "groung": 10, "group": [4, 18], "gt": 10, "gt_box": 10, "gt_label": 10, "gtk": [], "guid": 2, "guidanc": 16, "gvision": 18, "h": [7, 8, 9], "h_": 10, "ha": [2, 6, 10, 16], "half": [], "handl": [16, 18], "handwrit": 6, "handwritten": 16, "harass": 1, "hardwar": 18, "harm": 1, "hat": 10, "have": [1, 2, 10, 12, 14, 16, 17, 18], "head": [8, 18], "healthi": 1, "hebrew": 6, "height": [7, 9], "hello": [10, 18], "help": 17, "here": [5, 9, 11, 15, 16, 18], "hf": 8, "hf_hub_download": 8, "high": 7, "higher": [3, 6, 18], "hindi": 6, "hindi_digit": 6, "hocr": 18, "homebrew": [], "hook": 18, "horizont": [7, 9], "hous": 6, "how": [2, 12, 14, 16], "howev": 16, "hsv": 9, "html": [1, 2, 3, 7, 18], "http": [1, 3, 6, 7, 8, 14, 18], "hub": 8, "hue": 9, "huggingfac": 8, "hw": 6, "i": [1, 2, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17], "i7": 18, "ibrahimov": [], "ic03": [4, 6, 16], "ic13": [4, 6, 16], "icdar": [4, 6], "icdar2019": 6, "id": 18, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 6], "iiit5k": [6, 16], "iiithw": [4, 6, 16], "imag": [4, 6, 7, 8, 9, 10, 14, 15, 16, 18], "imagenet": 8, "imageri": 1, "images_90k_norm": 6, "img": [6, 9, 16, 17], "img_cont": 7, "img_fold": [6, 16], "img_path": 7, "img_transform": 6, "imgur5k": [4, 6, 16], "imgur5k_annot": 6, "imlist": 6, "impact": 1, "implement": [6, 7, 8, 9, 10, 18], "import": [6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18], "improv": 8, "inappropri": 1, "incid": 1, "includ": [1, 6, 16, 17], "inclus": 1, "increas": 9, "independ": 9, "index": [2, 7], "indic": 10, "individu": 1, "infer": [4, 8, 9, 15], "inform": [1, 2, 4, 6, 16], "inherit": [], "input": [2, 7, 8, 9, 17, 18], "input_crop": 8, "input_pag": [8, 10, 18], "input_shap": 17, "input_t": [], "input_tensor": 8, "inspir": [1, 9], "instal": [14, 15, 17], "instanc": [1, 18], "instanti": [8, 18], "instead": [6, 7, 8], "insult": 1, "int": [6, 7, 9], "int64": 10, "integ": 10, "integr": [4, 14, 16], "intel": 18, "interact": [1, 7, 10], "interfac": [14, 17], "interoper": 17, "interpol": 9, "interpret": [6, 7], "intersect": 10, "invert": 9, "investig": 1, "invis": 1, 
"invoic": [], "involv": [1, 18], "io": [14, 15, 17], "iou": 10, "iou_thresh": 10, "iou_threshold": 15, "irregular": [4, 8, 16], "isn": 6, "issu": [1, 2, 14], "italian": 6, "iter": [6, 9, 16, 18], "its": [7, 8, 9, 10, 16, 18], "itself": [8, 14], "j": 10, "jame": [], "job": 2, "join": 2, "jpeg": 9, "jpegqual": 9, "jpg": [6, 7, 14, 17], "json": [6, 16, 18], "json_output": 18, "jump": 2, "just": 1, "kei": [4, 6], "kera": [8, 17], "kernel": [4, 8, 9], "kernel_s": [], "kernel_shap": 9, "keywoard": 8, "keyword": [6, 7, 8, 10], "kie": [8, 12], "kie_predictor": [8, 12], "kiepredictor": 8, "kind": 1, "know": [2, 17], "kwarg": [6, 7, 8, 10], "l": 10, "l_j": 10, "label": [6, 10, 15, 16], "label_fil": [6, 16], "label_fold": 6, "label_path": [6, 16], "labels_path": [6, 16], "ladder": 1, "lambda": 9, "lambdatransform": 9, "lang": 18, "languag": [1, 4, 6, 7, 8, 14, 18], "larg": [8, 14], "largest": 10, "last": [3, 6], "latenc": 8, "later": 2, "latest": 18, "latin": 6, "layer": 17, "layout": 18, "lead": 1, "leader": 1, "learn": [1, 4, 8, 17, 18], "least": 3, "left": [10, 18], "legacy_french": 6, "length": [6, 18], "less": [17, 18], "let": [], "letter": [], "level": [1, 6, 10, 18], "levenshtein": [], "leverag": 11, "lf": 14, "libffi": [], "librari": [2, 3, 11, 12], "light": 4, "lightweight": 17, "like": 1, "limits_": 10, "line": [4, 8, 10, 18], "line_1_1": 18, "link": 12, "linknet": [4, 8], "linknet16": [], "linknet_resnet18": [8, 12, 17, 18], "linknet_resnet18_rot": [], "linknet_resnet34": [8, 17, 18], "linknet_resnet50": [8, 18], "linux": [], "list": [6, 7, 9, 10, 14], "ll": 10, "load": [4, 6, 8, 15, 17], "load_state_dict": 12, "load_weight": 12, "loader": [], "loc_pr": 18, "local": [2, 4, 6, 8, 10, 16, 18], "localis": 6, "localizationconfus": 10, "locat": [2, 7, 18], "login": 8, "login_to_hub": [8, 14], "logo": [7, 15, 16], "love": 14, "lower": [9, 10, 18], "m": [2, 10, 18], "m1": 3, "macbook": 3, "machin": 17, "maco": [], "made": 4, "magc_resnet31": 8, "mai": [1, 2], "mail": 1, "main": 11, "maintain": 4, "mainten": 2, "make": [1, 2, 10, 13, 14, 17, 18], "mani": [16, 18], "manipul": 18, "map": [6, 8], "map_loc": 12, "mask_shap": [], "master": [4, 8, 18], "match": [10, 18], "mathcal": 10, "matplotlib": [7, 10], "max": [6, 9, 10], "max_angl": 9, "max_area": 9, "max_char": [6, 16], "max_delta": 9, "max_dist": [], "max_gain": 9, "max_gamma": 9, "max_qual": 9, "max_ratio": 9, "maximum": [6, 9], "maxval": [8, 9], "mbox": 10, "mean": [9, 10, 12], "meaniou": 10, "meant": [7, 17], "measur": 18, "media": 1, "median": 8, "meet": 12, "member": 1, "memori": [13, 17], "mention": 18, "merg": 6, "messag": 2, "meta": 18, "metadata": 17, "metal": 3, "method": [7, 9, 18], "metric": [10, 18], "middl": 18, "might": [17, 18], "min": 9, "min_area": 9, "min_char": [6, 16], "min_gain": 9, "min_gamma": 9, "min_qual": 9, "min_ratio": 9, "min_val": 9, "minde": [1, 3, 4, 8], "minim": [2, 4], "minimalist": [4, 8], "minimum": [3, 6, 9, 10, 18], "minval": 9, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": 17, "mixed_precis": 17, "mjsynth": [4, 6, 16], "mnt": 6, "mobilenet": [8, 14], "mobilenet_v3_larg": 8, "mobilenet_v3_large_r": 8, "mobilenet_v3_smal": 8, "mobilenet_v3_small_crop_orient": 8, "mobilenet_v3_small_orient": [], "mobilenet_v3_small_page_orient": 8, "mobilenet_v3_small_r": 8, "mobilenetv3": 8, "modal": [4, 6], "mode": 3, "model": [6, 10, 13, 15, 16], "model_nam": [8, 14, 17], "model_path": [15, 17], "moder": 1, "modif": 2, "modifi": [8, 13, 18], "modul": [3, 7, 8, 9, 10, 18], "moment": [], "more": [2, 16, 18], 
"moscardi": [], "most": 18, "mozilla": 1, "multi": [4, 8], "multilingu": [6, 14], "multipl": [6, 7, 9, 18], "multipli": 9, "multiprocess": 13, "my": 8, "my_awesome_model": 14, "my_hook": 18, "n": [6, 10], "na": [], "name": [6, 8, 17, 18], "nation": 1, "natur": [1, 4, 6], "nb": [], "ndarrai": [6, 7, 9, 10], "necessari": [3, 12, 13], "need": [2, 3, 6, 10, 12, 13, 14, 15, 18], "neg": 9, "nest": 18, "nestedobject": [], "netraj": [], "network": [4, 6, 8, 17], "neural": [4, 6, 8, 17], "new": [2, 10], "newer": [], "next": [6, 16], "nois": 9, "noisi": [4, 6], "non": [4, 6, 7, 8, 9, 10], "none": [6, 7, 8, 9, 10, 18], "normal": [8, 9], "norwegian": 6, "note": [0, 2, 6, 8, 14, 15, 17], "now": 2, "np": [8, 9, 10, 18], "num_output_channel": 9, "num_sampl": [6, 16], "num_work": [], "number": [6, 9, 10, 18], "numpi": [7, 8, 10, 18], "o": 3, "obb": 15, "obj_detect": 14, "object": [6, 7, 10, 11, 15, 18], "objectness_scor": [7, 18], "oblig": 1, "obtain": 18, "occupi": 17, "ocr": [4, 6, 8, 10, 14, 16], "ocr_carea": 18, "ocr_db_crnn": 10, "ocr_lin": 18, "ocr_pag": 18, "ocr_par": 18, "ocr_predictor": [8, 12, 14, 17, 18], "ocrdataset": [6, 16], "ocrmetr": 10, "ocrpredictor": [8, 12], "ocrx_word": 18, "offens": 1, "offici": [1, 8], "offlin": 1, "offset": 9, "onc": 18, "one": [2, 6, 8, 9, 12, 14, 18], "oneof": 9, "ones": [6, 10], "onli": [2, 8, 9, 10, 14, 16, 17, 18], "onlin": 1, "onnx": 15, "onnxruntim": [15, 17], "onnxtr": 17, "opac": 9, "opacity_rang": 9, "open": [1, 2, 14, 17], "opinion": 1, "optic": [4, 18], "optim": [4, 18], "option": [6, 8, 12], "order": [2, 6, 7, 9], "org": [1, 6, 8, 18], "organ": 7, "orient": [1, 7, 8, 15, 18], "orientationpredictor": 8, "other": [1, 2], "otherwis": [1, 7, 10], "our": [2, 8, 18], "out": [2, 8, 9, 10, 18], "outpout": 18, "output": [7, 9, 17], "output_s": [7, 9], "outsid": 13, "over": [6, 10, 18], "overal": [1, 8], "overlai": 7, "overview": 15, "overwrit": [], "overwritten": 14, "own": 4, "p": [9, 18], "packag": [2, 4, 10, 13, 15, 16, 17], "pad": [6, 8, 9, 18], "page": [3, 6, 8, 10, 18], "page1": 7, "page2": 7, "page_1": 18, "page_idx": [7, 18], "page_orientation_predictor": 8, "page_param": [], "pair": 10, "pango": [], "paper": 8, "par_1_1": 18, "paragraph": 18, "paragraph_break": 18, "parallel": [], "param": [9, 18], "paramet": [4, 7, 8, 17], "pars": [4, 6], "parseq": [4, 8, 14, 17, 18], "part": [6, 9, 18], "parti": 3, "partial": 18, "particip": 1, "pass": [6, 7, 8, 18], "password": 7, "patch": [8, 10], "path": [6, 7, 15, 16, 17], "path_to_checkpoint": 12, "path_to_custom_model": 17, "path_to_pt": 12, "patil": [], "pattern": 1, "pdf": [7, 8, 11], "pdfpage": 7, "peopl": 1, "per": [9, 18], "perform": [4, 7, 8, 9, 10, 13, 17, 18], "period": 1, "permiss": 1, "permut": [4, 8], "persian_lett": 6, "person": [1, 16], "phase": 18, "photo": 16, "physic": [1, 7], "pick": 9, "pictur": 7, "pip": [2, 3, 15, 17], "pipelin": 18, "pixbuf": [], "pixel": [7, 9, 18], "platinum": [], "pleas": 2, "plot": 10, "plt": 10, "plug": 14, "plugin": 3, "png": 7, "point": 17, "polici": 13, "polish": 6, "polit": 1, "polygon": [6, 10, 18], "pool": 8, "portugues": 6, "posit": [1, 10], "possibl": [2, 10, 14, 18], "post": [1, 18], "postprocessor": 18, "potenti": 8, "power": 4, "ppageno": 18, "pre": [2, 8, 17], "precis": [10, 18], "pred": 10, "pred_box": 10, "pred_label": 10, "predefin": 16, "predict": [7, 8, 10, 18], "predictor": [4, 7, 8, 12, 14, 17], "prefer": 16, "preinstal": 3, "preprocessor": [12, 18], "prerequisit": 14, "present": 11, "preserv": [8, 9, 18], "preserve_aspect_ratio": [7, 8, 9, 12, 18], 
"pretrain": [4, 8, 10, 12, 17, 18], "pretrained_backbon": [8, 12], "print": 18, "prior": 6, "privaci": 1, "privat": 1, "probabl": 9, "problem": 2, "procedur": 9, "process": [2, 4, 7, 12, 18], "processor": 18, "produc": [11, 18], "product": 17, "profession": 1, "project": [2, 16], "promptli": 1, "proper": 2, "properli": 6, "properti": [], "provid": [1, 2, 4, 14, 15, 16, 18], "public": [1, 4], "publicli": 18, "publish": 1, "pull": 14, "punctuat": 6, "pure": 6, "purpos": 2, "push_to_hf_hub": [8, 14], "py": 14, "pypdfium2": [3, 7], "pyplot": [7, 10], "python": [2, 15], "python3": 14, "pytorch": [3, 4, 8, 9, 12, 14, 17, 18], "q": 2, "qr": [7, 15], "qr_code": 16, "qualiti": 9, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 11, "r": 18, "race": 1, "ramdisk": 6, "rand": [8, 9, 10, 17, 18], "random": [8, 9, 10, 18], "randomappli": 9, "randombright": 9, "randomcontrast": 9, "randomcrop": 9, "randomgamma": 9, "randomhorizontalflip": 9, "randomhu": 9, "randomjpegqu": 9, "randomli": 9, "randomres": 9, "randomrot": 9, "randomsatur": 9, "randomshadow": 9, "rang": 9, "rassi": 14, "ratio": [8, 9, 18], "raw": [7, 10], "re": 17, "read": [4, 6, 8], "read_html": 7, "read_img": [], "read_img_as_numpi": 7, "read_img_as_tensor": 7, "read_pdf": 7, "readi": 17, "real": [4, 8, 9], "realli": [], "reason": [1, 4, 6], "rebuild": 2, "rebuilt": 2, "recal": [10, 18], "receipt": [4, 6, 18], "reco_arch": [8, 12, 14, 17], "reco_b": 18, "reco_model": [12, 14, 17], "reco_param": 12, "reco_predictor": 12, "recogn": 18, "recognit": [6, 10, 12], "recognition_predictor": [8, 18], "recognition_task": [6, 16], "recognitiondataset": [6, 16], "recognitionpredictor": [8, 12], "rectangular": 8, "recurr": [], "red": [], "reduc": [3, 9], "refer": [2, 3, 12, 14, 15, 16, 18], "regardless": 1, "region": 18, "regroup": 10, "regular": 16, "reject": 1, "rel": [7, 9, 10, 18], "relat": 7, "releas": [0, 3], "relev": 15, "religion": 1, "relu": [], "remov": 1, "render": [7, 18], "repo": 8, "repo_id": [8, 14], "report": 1, "repositori": [6, 8, 14], "repres": [1, 17, 18], "represent": [4, 8], "request": [1, 14], "requir": [3, 9, 17], "research": 4, "residu": 8, "resiz": [9, 18], "resnet": 8, "resnet18": [8, 14], "resnet31": 8, "resnet34": 8, "resnet50": [8, 14], "resolv": 7, "resolve_block": 18, "resolve_lin": 18, "resourc": 16, "respect": 1, "respons": [], "rest": [2, 9, 10], "restrict": 13, "result": [2, 6, 7, 11, 14, 17, 18], "return": 18, "reusabl": 18, "review": 1, "rgb": [7, 9], "rgb_mode": 7, "rgb_output": 7, "right": [1, 8, 10], "roboflow": [], "robust": [4, 6], "root": 6, "rotat": [6, 7, 8, 9, 10, 16, 18], "rotated_bbox": [], "run": [2, 3, 8], "same": [2, 7, 10, 16, 17, 18], "sampl": [6, 16, 18], "sample_transform": 6, "sanjin": [], "sar": [4, 8], "sar_resnet31": [8, 18], "sar_vgg16_bn": [], "satur": 9, "save": [8, 16], "saved_model": [], "scale": [7, 8, 9, 10], "scale_rang": 9, "scan": [4, 6], "scene": [4, 6, 8], "scheme": [], "score": [7, 10], "scratch": [], "script": [2, 16], "seamless": 4, "seamlessli": [4, 18], "search": 8, "searchabl": 11, "sec": 18, "second": 18, "section": [12, 14, 15, 17, 18], "secur": [1, 13], "see": [1, 2], "seemlessli": [], "seen": 18, "segment": [4, 8, 18], "self": 18, "semant": [4, 8], "send": 18, "sens": 10, "sensit": 16, "separ": 18, "sequenc": [4, 6, 7, 8, 10, 18], "sequenti": [9, 18], "seri": 1, "serial": [], "serialized_model": [], "seriou": 1, "set": [1, 3, 6, 8, 10, 13, 15, 18], "set_global_polici": 17, "sever": [7, 9, 18], "sex": 1, "sexual": 1, "sha256": [], "shade": 9, 
"shape": [4, 7, 8, 9, 10, 18], "share": [13, 16], "shift": 9, "shm": 13, "should": [2, 6, 7, 9, 10], "show": [4, 7, 8, 10, 12, 14, 15], "showcas": 2, "shuffl": [6, 9], "side": 10, "signatur": 7, "signific": 16, "simpl": [4, 8, 17], "simpler": 8, "sinc": [6, 16], "singl": [1, 2, 4, 6], "single_img_doc": 17, "size": [1, 6, 7, 9, 15, 18], "skew": 18, "slack": 2, "slightli": 8, "small": [2, 8], "smallest": 7, "snapshot_download": 8, "snippet": 18, "so": [2, 3, 6, 8, 14, 16], "social": 1, "socio": 1, "some": [3, 11, 14, 16], "someth": 2, "somewher": 2, "soon": [], "sort": 1, "sourc": [6, 7, 8, 9, 10, 14], "space": [1, 18], "span": 18, "spanish": 6, "spatial": [4, 6, 7], "special": [], "specif": [2, 3, 10, 12, 16, 18], "specifi": [1, 6, 7], "speed": [4, 8], "sphinx": 2, "sroie": [4, 6, 16], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": [], "standard": 9, "start": 6, "state": [4, 10, 15], "static": 10, "statist": [], "statu": 1, "std": [9, 12], "step": 13, "still": 18, "str": [6, 7, 8, 9, 10], "straight": [6, 8, 16, 18], "straighten": [], "straighten_pag": 8, "straigten_pag": [], "stream": 7, "street": [4, 6], "strict": 3, "strictli": 10, "string": [6, 7, 10, 18], "strive": 3, "strong": [4, 8], "structur": [17, 18], "subset": [6, 18], "suggest": [2, 14], "sum": 10, "summari": 10, "support": [3, 15, 17, 18], "sustain": 1, "svhn": [4, 6, 16], "svt": [6, 16], "swedish": 6, "symbol": [], "symmetr": [8, 9, 18], "symmetric_pad": [8, 9, 18], "synthes": [], "synthesize_pag": [], "synthet": 4, "synthtext": [4, 6, 16], "system": 18, "t": [2, 6, 12, 17, 18], "tabl": [14, 15], "take": [1, 6, 18], "target": [6, 7, 9, 10, 16], "target_s": 6, "task": [4, 6, 8, 14, 16, 18], "task2": 6, "team": 3, "techminde": 3, "templat": [2, 4], "tensor": [6, 7, 9, 18], "tensorflow": [3, 4, 7, 8, 9, 12, 14, 17, 18], "tensorspec": 17, "term": 1, "test": [6, 16], "test_set": 6, "text": [6, 7, 8, 10, 16], "text_output": 18, "textmatch": 10, "textnet": 8, "textnet_bas": 8, "textnet_smal": 8, "textnet_tini": 8, "textract": [4, 18], "textstylebrush": [4, 6], "textual": [4, 6, 7, 8, 18], "tf": [3, 7, 8, 9, 14, 17], "tf_model": [], "tflite": [], "than": [2, 10, 14], "thank": 2, "thei": [1, 10], "them": [6, 18], "thi": [1, 2, 3, 5, 6, 9, 10, 12, 13, 14, 16, 17, 18], "thing": [17, 18], "third": 3, "those": [1, 7, 18], "threaten": 1, "threshold": 18, "through": [1, 9, 15, 16], "tilman": 14, "time": [1, 4, 8, 10, 16], "tini": 8, "titl": [7, 18], "tm": 18, "tmp": 13, "togeth": [2, 7], "tograi": 9, "tool": 16, "top": [10, 17, 18], "topic": 2, "torch": [3, 9, 12, 14, 17], "torchvis": 9, "total": 12, "toward": [1, 3], "train": [2, 6, 8, 9, 14, 15, 16, 17, 18], "train_it": [6, 16], "train_load": [6, 16], "train_pytorch": 14, "train_set": [6, 16], "train_tensorflow": 14, "trainabl": [4, 8], "tranform": 9, "transcrib": 18, "transfer": [4, 6], "transfo": 9, "transform": [4, 6, 8], "translat": 1, "troll": 1, "true": [6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18], "truth": 10, "tune": 17, "tupl": [6, 7, 9, 10], "turn": [], "two": [7, 13], "txt": 6, "type": [7, 10, 14, 17, 18], "typic": 18, "u": [1, 2], "ucsd": 6, "udac": 2, "uint8": [7, 8, 10, 18], "ukrainian": [], "unaccept": 1, "underli": [16, 18], "underneath": 7, "understand": [4, 6, 18], "unidecod": [], "uniform": [8, 9], "uniformli": 9, "uninterrupt": [7, 18], "union": 10, "unit": [], "unittest": 2, "unlock": 7, "unoffici": 8, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [8, 18], "updat": 10, "upgrad": 2, "upper": [6, 9], "uppercas": 16, "url": 7, "us": [1, 2, 
3, 6, 8, 10, 12, 13, 14, 15, 18], "usabl": 18, "usag": [13, 17], "use_broadcast": [], "use_polygon": [6, 10, 16], "useabl": 18, "user": [4, 7, 11], "utf": 18, "util": 17, "v0": [], "v1": 14, "v3": [8, 14, 18], "valid": 16, "valu": [2, 7, 9, 18], "valuabl": 4, "variabl": 13, "varieti": 6, "veri": 8, "verifi": [], "verma": [], "version": [1, 2, 3, 17, 18], "vgg": 8, "vgg16": 14, "vgg16_bn_r": 8, "via": 1, "video": [], "vietnames": 6, "view": [4, 6], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 6, 8], "visiondataset": 6, "visiontransform": 8, "visual": [3, 4, 15], "visualize_pag": 10, "vit_": 8, "vit_b": 8, "vitstr": [4, 8, 17], "vitstr_bas": [8, 18], "vitstr_smal": [8, 12, 17, 18], "viz": 3, "vocab": [12, 14, 16, 17, 18], "vocabulari": [6, 12, 14], "w": [7, 8, 9, 10], "w3": 18, "wa": 1, "wai": [1, 4, 16], "want": [2, 17, 18], "warm": [], "warmup": 18, "wasn": 2, "we": [1, 2, 3, 4, 7, 9, 14, 16, 17, 18], "weasyprint": 7, "web": [2, 7], "websit": 6, "weight": 12, "welcom": 1, "well": [1, 17], "were": [1, 7, 18], "what": 1, "when": [1, 2, 8], "whenev": 2, "where": [2, 7, 9, 10], "whether": [2, 6, 7, 9, 10, 16, 18], "which": [1, 8, 13, 15, 16, 18], "whichev": 3, "while": [9, 18], "why": 1, "width": [7, 9], "wiki": 1, "wildreceipt": [4, 6, 16], "window": [8, 10], "wish": 2, "within": 1, "without": [1, 6, 8], "wonder": 2, "word": [4, 6, 8, 10, 18], "word_1_1": 18, "word_1_2": 18, "word_1_3": 18, "wordgener": [6, 16], "words_onli": 10, "work": [13, 18], "worker": [], "workflow": 2, "worklow": 2, "world": [10, 18], "worth": 8, "wrap": 18, "wrapper": [6, 9], "write": 13, "written": [1, 7], "www": [1, 7, 18], "x": [7, 9, 10], "x12larg": [], "x_ascend": 18, "x_descend": 18, "x_i": 10, "x_size": 18, "x_wconf": 18, "xeon": [], "xhtml": 18, "xmax": 7, "xmin": 7, "xml": 18, "xml_bytes_str": 18, "xml_element": 18, "xml_output": 18, "xmln": 18, "y": 10, "y_i": 10, "y_j": 10, "yet": 15, "ymax": 7, "ymin": 7, "yolov8": 15, "you": [2, 3, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18], "your": [2, 4, 7, 10, 18], "yoursit": 7, "yugesh": [], "zero": [9, 10], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 6, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 6, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 6, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 6, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 6, 
"\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 6, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 6, "\u00e4\u00f6\u00e4\u00f6": 6, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 6, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 6, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 6, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 6, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 6, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 6, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 6, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 6, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 6, "\u067e\u0686\u06a2\u06a4\u06af": 6, "\u0905": 6, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 6, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 
6, "\u0950": 6, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 6, "\u09bd": 6, "\u09ce": 6, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 6}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": 0, "9": [], "advanc": 18, "approach": 18, "architectur": 18, "arg": [6, 7, 8, 9, 10], "artefact": 7, "artefactdetect": 15, "attribut": 1, "avail": [15, 16, 18], "aw": 13, "ban": 1, "block": 7, "bug": 2, "build": [], "changelog": 0, "choos": [16, 18], "classif": [8, 14], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 14, "compos": 9, "compress": [], "conda": 3, "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": 5, "contribut": [2, 5, 15], "contributor": 1, "convent": 14, "correct": 1, "coven": 1, "custom": [6, 12], "data": 16, "dataload": 6, "dataset": [4, 6, 16], "detect": [4, 8, 14, 16, 18], "develop": 2, "do": 18, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 11], "document": [2, 4, 7], "end": 18, "enforc": 1, "evalu": 10, "export": 17, "factori": 8, "featur": [2, 4], "feedback": 2, "file": 7, "from": 14, "gener": [6, 16], "get": [], "git": 3, "guidelin": 1, "half": 17, "hub": 14, "huggingfac": 14, "i": 18, "implement": [], "infer": 17, "instal": [2, 3], "integr": [2, 15], "io": 7, "lambda": 13, "let": 2, "line": 7, "linux": 3, "load": [12, 14, 16], "loader": 6, "main": 4, "mode": 2, "model": [4, 8, 12, 14, 17, 18], "modifi": 2, "modul": [5, 15], "name": 14, "note": [], "notebook": 11, "object": 16, "ocr": 18, "onli": 3, "onnx": 17, "optim": 17, "option": 18, "orient": [], "our": 1, "output": 18, "own": [12, 16], "packag": 3, "page": 7, "perman": 1, "pipelin": 15, "pledg": 1, "post": [], "pre": [], "precis": 17, "predictor": 18, "prepar": 17, "prerequisit": 3, "pretrain": 14, "process": [], "push": 14, "python": 3, "qualiti": 2, "question": 2, "read": 7, "readi": 16, "recognit": [4, 8, 14, 16, 18], "refer": [], "report": 2, "request": 2, "resourc": [], "respons": 1, "return": [6, 7, 8, 10], "right": 18, "savedmodel": [], "scope": 1, "share": 14, "should": 18, "stage": 18, "standard": 1, "start": [], "structur": [2, 7], "style": 2, "support": [4, 5, 6, 9], "synthet": [6, 16], "task": 10, "temporari": 1, "test": 2, "text": [4, 18], "train": 12, "transform": 9, "two": 18, "unit": 2, "us": [16, 17], "util": 10, "v0": 0, "verif": 2, "via": 3, "visual": 10, "vocab": 6, "warn": 1, "what": 18, "word": 7, "your": [12, 14, 15, 16, 17], "zoo": [4, 8]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. 
Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[13, null]], "Advanced options": [[18, "advanced-options"]], "Args:": [[6, "args"], [6, "id4"], [6, "id7"], [6, "id10"], [6, "id13"], [6, "id16"], [6, "id19"], [6, "id22"], [6, "id25"], [6, "id29"], [6, "id32"], [6, "id37"], [6, "id40"], [6, "id46"], [6, "id49"], [6, "id50"], [6, "id51"], [6, "id54"], [6, "id57"], [6, "id60"], [6, "id61"], [7, "args"], [7, "id2"], [7, "id3"], [7, "id4"], [7, "id5"], [7, "id6"], [7, "id7"], [7, "id10"], [7, "id12"], [7, "id14"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id28"], [8, "args"], [8, "id3"], [8, "id8"], [8, "id13"], [8, "id17"], [8, "id21"], [8, "id26"], [8, "id31"], [8, "id36"], [8, "id41"], [8, "id46"], [8, "id50"], [8, "id54"], [8, "id59"], [8, "id63"], [8, "id68"], [8, "id73"], [8, "id77"], [8, "id81"], [8, "id85"], [8, "id90"], [8, "id95"], [8, "id99"], [8, "id104"], [8, "id109"], [8, "id114"], [8, "id119"], [8, "id123"], [8, "id127"], [8, "id132"], [8, "id137"], [8, "id142"], [8, "id146"], [8, "id150"], [8, "id155"], [8, "id159"], [8, "id163"], [8, "id167"], [8, "id169"], [8, "id171"], [8, "id173"], [9, "args"], [9, "id1"], [9, "id2"], [9, "id3"], [9, "id4"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"], [9, "id12"], [9, "id13"], [9, "id14"], [9, "id15"], [9, "id16"], [9, "id17"], [9, "id18"], [9, "id19"], [10, "args"], [10, "id3"], [10, "id4"], [10, "id5"], [10, "id6"], [10, "id7"], [10, "id8"], [10, "id9"]], "Artefact": [[7, "artefact"]], "ArtefactDetection": [[15, "artefactdetection"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[16, "available-datasets"]], "Available architectures": [[18, "available-architectures"], [18, "id1"], [18, "id2"]], "Available contribution modules": [[15, "available-contribution-modules"]], "Block": [[7, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[16, null]], "Choosing the right model": [[18, null]], "Classification": [[14, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[9, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[6, "custom-dataset-loader"]], "Data Loading": [[16, "data-loading"]], "Dataloader": [[6, "dataloader"]], "Detection": [[14, "detection"], [16, "detection"]], "Detection predictors": [[18, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[7, "document"]], "Document structure": [[7, "document-structure"]], "End-to-End OCR": [[18, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[17, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[7, "file-reading"]], "Half-precision": [[17, "half-precision"]], "Installation": [[3, null]], "Integrate contributions into your pipeline": [[15, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[7, "line"]], "Loading from Huggingface Hub": [[14, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[12, 
"loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[17, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[14, "naming-conventions"]], "Object Detection": [[16, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[7, "page"]], "Preparing your model for inference": [[17, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[14, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[14, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[14, "recognition"], [16, "recognition"]], "Recognition predictors": [[18, "recognition-predictors"]], "Returns:": [[6, "returns"], [7, "returns"], [7, "id11"], [7, "id13"], [7, "id15"], [7, "id19"], [7, "id23"], [7, "id27"], [7, "id31"], [8, "returns"], [8, "id6"], [8, "id11"], [8, "id16"], [8, "id20"], [8, "id24"], [8, "id29"], [8, "id34"], [8, "id39"], [8, "id44"], [8, "id49"], [8, "id53"], [8, "id57"], [8, "id62"], [8, "id66"], [8, "id71"], [8, "id76"], [8, "id80"], [8, "id84"], [8, "id88"], [8, "id93"], [8, "id98"], [8, "id102"], [8, "id107"], [8, "id112"], [8, "id117"], [8, "id122"], [8, "id126"], [8, "id130"], [8, "id135"], [8, "id140"], [8, "id145"], [8, "id149"], [8, "id153"], [8, "id158"], [8, "id162"], [8, "id166"], [8, "id168"], [8, "id170"], [8, "id172"], [10, "returns"]], "Scope": [[1, "scope"]], "Share your model with the community": [[14, null]], "Supported Vocabs": [[6, "supported-vocabs"]], "Supported contribution modules": [[5, "supported-contribution-modules"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[9, "supported-transformations"]], "Synthetic dataset generator": [[6, "synthetic-dataset-generator"], [16, "synthetic-dataset-generator"]], "Task evaluation": [[10, "task-evaluation"]], "Text Detection": [[18, "text-detection"]], "Text Recognition": [[18, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[12, null]], "Two-stage approaches": [[18, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[16, "use-your-own-datasets"]], "Using your ONNX exported model": [[17, "using-your-onnx-exported-model"]], "Via Conda (Only for Linux)": [[3, "via-conda-only-for-linux"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[10, "visualization"]], "What should I do with the output?": [[18, "what-should-i-do-with-the-output"]], "Word": [[7, "word"]], "docTR Notebooks": [[11, null]], "docTR Vocabs": [[6, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.contrib": [[5, null]], "doctr.datasets": [[6, null], [6, "datasets"]], "doctr.io": [[7, null]], "doctr.models": [[8, null]], "doctr.models.classification": [[8, "doctr-models-classification"]], "doctr.models.detection": [[8, "doctr-models-detection"]], "doctr.models.factory": [[8, "doctr-models-factory"]], "doctr.models.recognition": [[8, "doctr-models-recognition"]], "doctr.models.zoo": [[8, "doctr-models-zoo"]], "doctr.transforms": [[9, null]], "doctr.utils": [[10, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], 
"v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]], "v0.8.1 (2024-03-04)": [[0, "v0-8-1-2024-03-04"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/contrib", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_contrib_modules", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/contrib.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_contrib_modules.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[7, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[7, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[9, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[6, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[9, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[9, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[6, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[8, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[6, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[8, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[8, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[7, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[8, "doctr.models.detection.detection_predictor", false]], 
"detectiondataset (class in doctr.datasets)": [[6, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[6, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[7, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[7, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[6, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[8, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[7, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[6, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[9, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[9, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[6, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[6, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[6, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[6, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[6, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[8, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[9, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[7, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[8, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[8, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[6, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_crop_orientation() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_crop_orientation", false]], "mobilenet_v3_small_page_orientation() (in 
module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_page_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[9, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[8, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[6, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[9, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[7, "doctr.io.Page", false]], "page_orientation_predictor() (in module doctr.models.classification)": [[8, "doctr.models.classification.page_orientation_predictor", false]], "parseq() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[8, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[9, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[9, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[9, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[9, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[9, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[9, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[9, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[9, "doctr.transforms.RandomJpegQuality", false]], "randomresize (class in doctr.transforms)": [[9, "doctr.transforms.RandomResize", false]], "randomrotate (class in doctr.transforms)": [[9, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[9, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[9, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[7, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[7, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[7, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[7, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[6, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[9, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[8, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[7, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[7, "doctr.io.Page.show", false]], "sroie (class in 
doctr.datasets)": [[6, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[10, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[10, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[10, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[10, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[6, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[6, "doctr.datasets.SVT", false]], "synthtext (class in doctr.datasets)": [[6, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[10, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[8, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[9, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[10, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[10, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[10, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[10, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[8, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[10, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[8, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[8, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[8, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[6, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[7, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[6, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[6, 0, 1, "", "CORD"], [6, 0, 1, "", "CharacterGenerator"], [6, 0, 1, "", "DetectionDataset"], [6, 0, 1, "", "DocArtefacts"], [6, 0, 1, "", "FUNSD"], [6, 0, 1, "", "IC03"], [6, 0, 1, "", "IC13"], [6, 0, 1, "", "IIIT5K"], [6, 0, 1, "", "IIITHWS"], [6, 0, 1, "", "IMGUR5K"], [6, 0, 1, "", "MJSynth"], [6, 0, 1, "", "OCRDataset"], [6, 0, 1, "", "RecognitionDataset"], [6, 0, 1, "", "SROIE"], [6, 0, 1, "", "SVHN"], [6, 0, 1, "", "SVT"], [6, 0, 1, "", "SynthText"], [6, 0, 1, "", "WILDRECEIPT"], [6, 0, 1, "", "WordGenerator"], [6, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[6, 0, 1, "", "DataLoader"]], "doctr.io": [[7, 0, 1, "", "Artefact"], [7, 0, 1, "", "Block"], [7, 0, 1, "", "Document"], [7, 0, 1, "", "DocumentFile"], [7, 0, 1, "", "Line"], [7, 0, 1, "", "Page"], [7, 0, 1, "", "Word"], [7, 1, 1, "", "decode_img_as_tensor"], [7, 1, 1, "", "read_html"], [7, 1, 1, "", "read_img_as_numpy"], 
[7, 1, 1, "", "read_img_as_tensor"], [7, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[7, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[7, 2, 1, "", "from_images"], [7, 2, 1, "", "from_pdf"], [7, 2, 1, "", "from_url"]], "doctr.io.Page": [[7, 2, 1, "", "show"]], "doctr.models": [[8, 1, 1, "", "kie_predictor"], [8, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[8, 1, 1, "", "crop_orientation_predictor"], [8, 1, 1, "", "magc_resnet31"], [8, 1, 1, "", "mobilenet_v3_large"], [8, 1, 1, "", "mobilenet_v3_large_r"], [8, 1, 1, "", "mobilenet_v3_small"], [8, 1, 1, "", "mobilenet_v3_small_crop_orientation"], [8, 1, 1, "", "mobilenet_v3_small_page_orientation"], [8, 1, 1, "", "mobilenet_v3_small_r"], [8, 1, 1, "", "page_orientation_predictor"], [8, 1, 1, "", "resnet18"], [8, 1, 1, "", "resnet31"], [8, 1, 1, "", "resnet34"], [8, 1, 1, "", "resnet50"], [8, 1, 1, "", "textnet_base"], [8, 1, 1, "", "textnet_small"], [8, 1, 1, "", "textnet_tiny"], [8, 1, 1, "", "vgg16_bn_r"], [8, 1, 1, "", "vit_b"], [8, 1, 1, "", "vit_s"]], "doctr.models.detection": [[8, 1, 1, "", "db_mobilenet_v3_large"], [8, 1, 1, "", "db_resnet50"], [8, 1, 1, "", "detection_predictor"], [8, 1, 1, "", "fast_base"], [8, 1, 1, "", "fast_small"], [8, 1, 1, "", "fast_tiny"], [8, 1, 1, "", "linknet_resnet18"], [8, 1, 1, "", "linknet_resnet34"], [8, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[8, 1, 1, "", "from_hub"], [8, 1, 1, "", "login_to_hub"], [8, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[8, 1, 1, "", "crnn_mobilenet_v3_large"], [8, 1, 1, "", "crnn_mobilenet_v3_small"], [8, 1, 1, "", "crnn_vgg16_bn"], [8, 1, 1, "", "master"], [8, 1, 1, "", "parseq"], [8, 1, 1, "", "recognition_predictor"], [8, 1, 1, "", "sar_resnet31"], [8, 1, 1, "", "vitstr_base"], [8, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[9, 0, 1, "", "ChannelShuffle"], [9, 0, 1, "", "ColorInversion"], [9, 0, 1, "", "Compose"], [9, 0, 1, "", "GaussianBlur"], [9, 0, 1, "", "GaussianNoise"], [9, 0, 1, "", "LambdaTransformation"], [9, 0, 1, "", "Normalize"], [9, 0, 1, "", "OneOf"], [9, 0, 1, "", "RandomApply"], [9, 0, 1, "", "RandomBrightness"], [9, 0, 1, "", "RandomContrast"], [9, 0, 1, "", "RandomCrop"], [9, 0, 1, "", "RandomGamma"], [9, 0, 1, "", "RandomHorizontalFlip"], [9, 0, 1, "", "RandomHue"], [9, 0, 1, "", "RandomJpegQuality"], [9, 0, 1, "", "RandomResize"], [9, 0, 1, "", "RandomRotate"], [9, 0, 1, "", "RandomSaturation"], [9, 0, 1, "", "RandomShadow"], [9, 0, 1, "", "Resize"], [9, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[10, 0, 1, "", "DetectionMetric"], [10, 0, 1, "", "LocalizationConfusion"], [10, 0, 1, "", "OCRMetric"], [10, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[10, 2, 1, "", "summary"], [10, 2, 1, "", "update"]], "doctr.utils.visualization": [[10, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 7, 8, 10, 14, 17], "0": [1, 3, 6, 9, 10, 12, 15, 16, 18], "00": 18, "01": 18, "0123456789": 6, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 6, 
"0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 6, "02": [], "02562": 8, "03": 18, "035": 18, "0361328125": 18, "04": 18, "05": 18, "06": 18, "06640625": 18, "07": 18, "08": [9, 18], "09": 18, "0966796875": 18, "1": [3, 6, 7, 8, 9, 10, 12, 16, 18], "10": [6, 10, 18], "100": [6, 9, 10, 16, 18], "1000": 18, "101": 6, "1024": [8, 12, 18], "104": 6, "106": 6, "108": 6, "1095": 16, "11": 18, "110": 10, "1107": 16, "114": 6, "115": [], "1156": 16, "116": 6, "118": 6, "11800h": 18, "11th": 18, "12": [3, 18], "120": 6, "123": 6, "126": 6, "1268": 16, "128": [8, 12, 17, 18], "13": 18, "130": 6, "13068": 16, "131": 6, "1337891": 16, "1357421875": 18, "1396484375": 18, "14": 18, "1420": 18, "14470v1": 6, "149": 16, "15": 18, "150": [10, 18], "1552": 18, "16": [8, 17, 18], "1630859375": 18, "1684": 18, "16x16": 8, "17": 18, "1778": 18, "1782": 18, "18": [8, 18], "185546875": 18, "1900": 18, "1910": 8, "19342": 16, "19370": 16, "195": 6, "19598": 16, "199": 18, "1999": 18, "2": [3, 4, 6, 7, 9, 15, 18], "20": 18, "200": 10, "2000": 16, "2003": [4, 6], "2012": 6, "2013": [4, 6], "2015": 6, "2019": 4, "2023": [], "207901": 16, "21": 18, "2103": 6, "2186": 16, "21888": 16, "22": 18, "224": [8, 9], "225": 9, "22672": 16, "229": [9, 16], "23": 18, "233": 16, "234": 6, "236": [], "24": 18, "246": 16, "249": 16, "25": 18, "2504": 18, "255": [7, 8, 9, 10, 18], "256": 8, "257": 16, "26": 18, "26032": 16, "264": 12, "27": 18, "2700": 16, "2710": 18, "2749": 12, "28": 18, "287": 12, "29": 18, "296": 12, "299": 12, "2d": 18, "3": [3, 4, 7, 8, 9, 10, 17, 18], "30": 18, "300": 16, "3000": 16, "301": 12, "30595": 18, "30ghz": 18, "31": 8, "32": [6, 8, 9, 12, 16, 17, 18], "3232421875": 18, "33": [9, 18], "33402": 16, "33608": 16, "34": [8, 18], "340": 18, "3456": 18, "35": [], "3515625": 18, "36": 18, "360": 16, "37": [6, 18], "38": 18, "39": 18, "4": [8, 9, 10, 18], "40": 18, "406": 9, "41": 18, "42": 18, "43": 18, "44": 18, "45": 18, "456": 9, "46": 18, "47": 18, "472": 16, "48": [6, 18], "485": 9, "49": 18, "49377": 16, "5": [6, 9, 10, 15, 18], "50": [8, 16, 18], "51": 18, "51171875": 18, "512": 8, "52": [6, 18], "529": 18, "53": 18, "54": 18, "540": 18, "5478515625": 18, "55": 18, "56": 18, "57": 18, "58": 18, "580": 18, "5810546875": 18, "583": 18, "59": 18, "597": 18, "5k": [4, 6], "5m": 18, "6": [9, 18], "60": 9, "600": [8, 10, 18], "61": 18, "62": 18, "626": 16, "63": 18, "64": [8, 9, 18], "641": 18, "647": 16, "65": 18, "66": 18, "67": 18, "68": 18, "69": 18, "693": 12, "694": 12, "695": 12, "6m": 18, "7": 18, "70": [6, 10, 18], "707470": 16, "71": [6, 18], "7100000": 16, "7141797": 16, "7149": 16, "72": 18, "72dpi": 7, "73": 18, "73257": 16, "74": 18, "75": [9, 18], "7581382": 16, "76": 18, "77": 18, "772": 12, "772875": 16, "78": 18, "785": 12, "79": 18, "793533": 16, "796": 16, "798": 12, "7m": 18, "8": [8, 9, 18], "80": 18, "800": [8, 10, 16, 18], "81": 18, "82": 18, "83": 18, "84": 18, "849": 16, "85": 18, "8564453125": 18, "857": 18, "85875": 16, "86": 18, "8603515625": 18, "87": 18, "8707": 16, "88": 18, "89": 18, "9": [3, 9, 18], "90": 18, "90k": 6, "90kdict32px": 6, "91": 18, "914085328578949": 18, "92": 18, "93": 18, "94": [6, 18], "95": [10, 18], "9578408598899841": 18, "96": 18, "97": 18, "98": 18, "99": 18, "9949972033500671": 18, "A": [1, 2, 4, 6, 7, 8, 11, 17], "As": 2, "Be": 18, "Being": 1, "By": 13, "For": [1, 2, 3, 12, 18], "If": [2, 7, 8, 12, 18], "In": [2, 6, 16], "It": [9, 14, 15, 17], "Its": [4, 8], "No": [1, 18], "Of": 6, "Or": [15, 17], "The": [1, 2, 6, 7, 10, 
13, 15, 17, 18], "Then": 8, "To": [2, 3, 13, 14, 15, 17, 18], "_": [1, 6, 8], "__call__": 18, "_build": 2, "_i": 10, "ab": 6, "abc": 17, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 6, "abdef": [6, 16], "abl": [16, 18], "about": [1, 16, 18], "abov": 18, "abstract": [], "abstractdataset": 6, "abus": 1, "accept": 1, "access": [4, 7, 16, 18], "account": [1, 14], "accur": 18, "accuraci": 10, "achiev": 17, "act": 1, "action": 1, "activ": 4, "ad": [2, 8, 9], "adapt": 1, "add": [9, 10, 14, 18], "add_hook": 18, "add_label": 10, "addit": [2, 3, 7, 15], "addition": [2, 18], "address": [1, 7], "adjust": 9, "advanc": 1, "advantag": 17, "advis": 2, "aesthet": [4, 6], "affect": 1, "after": [14, 18], "ag": 1, "again": 8, "aggreg": [10, 16], "aggress": 1, "align": [1, 7, 9], "all": [1, 2, 5, 6, 7, 9, 10, 15, 16, 18], "allow": [1, 17], "along": 18, "alreadi": [2, 17], "also": [1, 8, 14, 15, 16, 18], "alwai": 16, "an": [1, 2, 4, 6, 7, 8, 10, 15, 17, 18], "analysi": [7, 15], "ancient_greek": 6, "andrej": [], "angl": [7, 9], "ani": [1, 6, 7, 8, 9, 10, 17, 18], "annot": 6, "anot": 16, "anoth": [8, 12, 16], "answer": 1, "anyascii": 10, "anyon": 4, "anyth": 15, "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 6, 9], "applic": [4, 8], "appoint": 1, "appreci": 14, "appropri": [1, 2, 18], "ar": [1, 2, 3, 5, 6, 7, 9, 10, 11, 15, 16, 18], "arab": 6, "arabic_diacrit": 6, "arabic_lett": 6, "arabic_punctu": 6, "arbitrarili": [4, 8], "arch": [8, 14], "architectur": [4, 8, 14, 15], "area": 18, "arg": [], "argument": [6, 7, 8, 10, 18], "around": 1, "arrai": [7, 9, 10], "art": [4, 15], "artefact": [10, 11, 15, 18], "artefact_typ": 7, "articl": [], "artifici": [4, 6], "arxiv": [6, 8], "asarrai": 10, "ascii_lett": 6, "aspect": [4, 8, 9, 18], "assess": 10, "assign": 10, "associ": 7, "assum": 8, "assume_straight_pag": [8, 18], "astyp": [8, 10, 18], "attack": 1, "attend": [4, 8], "attent": [1, 8], "autom": 4, "automat": 18, "autoregress": [4, 8], "avail": [1, 4, 5, 9], "averag": [9, 18], "avoid": [1, 3], "aw": [4, 18], "awar": 18, "azur": 18, "b": [8, 10, 18], "b_j": 10, "back": 2, "backbon": 8, "backend": 18, "background": 16, "bangla": 6, "bar": 15, "bar_cod": 16, "baranovskij": [], "base": [4, 8, 15], "baselin": [4, 8, 18], "batch": [6, 8, 9, 15, 16, 18], "batch_siz": [6, 12, 15, 16, 17], "bblanchon": 3, "bbox": 18, "becaus": 13, "been": [2, 10, 16, 18], "befor": [6, 8, 9, 18], "begin": 10, "behavior": [1, 18], "being": [10, 18], "belong": 18, "benchmark": 18, "best": 1, "better": [11, 18], "between": [9, 10, 18], "bgr": 7, "bilinear": 9, "bin_thresh": 18, "binar": [4, 8, 18], "binari": [7, 17, 18], "bit": 17, "blank": [], "block": [10, 18], "block_1_1": 18, "blue": [], "blur": 9, "bmvc": 6, "bn": 14, "bodi": [1, 18], "bool": [6, 7, 8, 9, 10], "boolean": [8, 18], "both": [4, 6, 9, 16, 18], "bottom": [8, 18], "bound": [6, 7, 8, 9, 10, 15, 18], "box": [6, 7, 8, 9, 10, 15, 16, 18], "box_thresh": 18, "brew": [], "bright": 9, "broadcast": [], "browser": [2, 4], "build": [2, 3, 17], "built": 2, "byte": [7, 18], "c": [3, 7, 10], "c_j": 10, "cach": [2, 6, 13], "cache_sampl": 6, "cairo": [], "call": 17, "callabl": [6, 9], "can": [2, 3, 12, 13, 14, 15, 16, 18], "capabl": [2, 11, 18], "case": [6, 10], "cf": 18, "cfg": 18, "challeng": 6, "challenge2_test_task12_imag": 6, "challenge2_test_task1_gt": 6, "challenge2_training_task12_imag": 6, "challenge2_training_task1_gt": 6, "chang": [13, 18], "channel": [1, 2, 7, 9], "channel_prior": 3, "channelshuffl": 9, "charact": [4, 6, 7, 10, 16, 18], 
"charactergener": [6, 16], "characterist": 1, "charg": 18, "charset": 18, "chart": 7, "check": [2, 14, 18], "checkpoint": 8, "chip": 3, "christian": [], "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 6, 7, 9, 10, 18], "class_nam": 12, "classif": 16, "classif_mobilenet_v3_smal": [], "classmethod": 7, "clear": 2, "clone": 3, "close": 2, "co": 14, "code": [4, 7, 15], "codecov": 2, "colab": 11, "collate_fn": 6, "collect": [7, 15], "color": 9, "colorinvers": 9, "column": 7, "com": [1, 3, 7, 8, 14], "combin": 18, "come": [], "command": [2, 15], "comment": 1, "commit": 1, "common": [1, 9, 10, 17], "commun": 1, "compar": 4, "comparison": [10, 18], "competit": 6, "compil": [11, 18], "complaint": 1, "complementari": 10, "complet": 2, "compon": 18, "compos": [6, 18], "comprehens": 18, "comput": [6, 10, 17, 18], "conf_threshold": 15, "confid": [7, 18], "config": [3, 8], "configur": 8, "confus": 10, "consecut": [9, 18], "consequ": 1, "consid": [1, 2, 6, 7, 10, 18], "consist": 18, "consolid": [4, 6], "constant": 9, "construct": 1, "consum": [], "contact": 1, "contain": [5, 6, 16], "content": [6, 7, 18], "context": 8, "contib": 3, "continu": 1, "contrast": 9, "contrast_factor": 9, "contrib": [3, 15], "contribut": 1, "contributor": 2, "convers": 7, "convert": [7, 9], "convolut": 8, "cool": [], "coordin": [7, 18], "cord": [4, 6, 16, 18], "core": [10, 18], "corner": 18, "correct": 9, "correspond": [3, 7, 9, 18], "could": [1, 15], "counterpart": 10, "cover": 2, "coverag": 2, "cpu": [4, 12, 17], "creat": 14, "crnn": [4, 8, 14], "crnn_mobilenet_v3_larg": [8, 14, 18], "crnn_mobilenet_v3_smal": [8, 17, 18], "crnn_vgg16_bn": [8, 12, 14, 18], "crop": [7, 8, 9, 16, 18], "crop_orient": [7, 18], "crop_orientation_predictor": 8, "crop_param": [], "croporientationpredictor": [], "cuda": 17, "currenc": 6, "current": [2, 18], "custom": [14, 15, 17, 18], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": 18, "cvit": 4, "czczup": 8, "czech": 6, "d": [6, 16], "danish": 6, "data": [4, 6, 7, 9, 10, 12, 14], "dataload": 16, "dataset": [8, 12, 18], "dataset_info": 6, "date": [12, 18], "db": 14, "db_mobilenet_v3_larg": [8, 14, 18], "db_resnet34": 18, "db_resnet50": [8, 12, 14, 18], "db_resnet50_rot": [], "dbnet": [4, 8], "deal": [], "decis": 1, "decod": 7, "decode_img_as_tensor": 7, "dedic": 17, "deem": 1, "deep": [8, 18], "def": 18, "default": [3, 7, 12, 13, 18], "defer": 16, "defin": [10, 17], "degre": [7, 9], "degress": 7, "delet": 2, "delimit": 18, "delta": 9, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4, 18], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": 8, "descript": 11, "design": 9, "desir": 7, "det_arch": [8, 12, 14, 17], "det_b": 18, "det_model": [12, 14, 17], "det_param": 12, "det_predictor": [12, 18], "detail": [12, 18], "detect": [6, 7, 10, 11, 12, 15], "detect_languag": 8, "detect_orient": 8, "detection_predictor": [8, 18], "detection_task": [], "detectiondataset": [6, 16], "detectionmetr": 10, "detectionpredictor": [8, 12], "detector": [4, 8, 15], "deterior": 8, "determin": 1, "dev": [2, 13], "develop": 3, "deviat": 9, "devic": 17, "dict": [7, 10, 18], "dictionari": [7, 10], "differ": 1, "differenti": [4, 8], "digit": [4, 6, 16], "dimens": [7, 10, 18], "dimension": 9, "direct": 6, "directli": [14, 18], "directori": [2, 13], "disabl": [1, 13, 18], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 18, "discuss": 2, "disparag": 1, "displai": [7, 10], "display_artefact": 10, "distribut": 9, "div": 18, "divers": 1, "divid": 7, "do": [2, 3, 8], "doc": 
[2, 7, 15, 17, 18], "docartefact": [6, 16], "docstr": 2, "doctr": [3, 12, 13, 14, 15, 16, 17, 18], "doctr_cache_dir": 13, "doctr_multiprocessing_dis": 13, "document": [6, 8, 10, 11, 15, 16, 17, 18], "documentbuild": 18, "documentfil": [7, 14, 15, 17], "doesn": 17, "don": [12, 18], "done": 9, "download": [6, 16], "downsiz": 8, "draw": 9, "draw_proba": [], "drop": 6, "drop_last": 6, "dtype": [7, 8, 9, 10, 17], "dual": [4, 6], "dummi": 14, "dummy_img": 18, "dummy_input": 17, "dure": 1, "dutch": 6, "dynam": [6, 15], "dynamic_seq_length": 6, "e": [1, 2, 3, 7, 8], "each": [4, 6, 7, 8, 9, 10, 16, 18], "eas": 2, "easi": [4, 10, 14, 17], "easili": [7, 10, 12, 14, 16, 18], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 6, 8], "either": [10, 18], "element": [6, 7, 8, 18], "els": [2, 15], "email": 1, "empathi": 1, "en": 18, "enabl": [6, 7], "enclos": 7, "encod": [4, 6, 7, 8, 18], "encode_sequ": 6, "encount": 2, "encrypt": 7, "end": [4, 6, 8, 10], "english": [6, 16], "enough": [2, 18], "ensur": 2, "entri": 6, "environ": [1, 13], "eo": 6, "equiv": 18, "estim": 8, "etc": [7, 15], "ethnic": 1, "evalu": [16, 18], "event": 1, "everyon": 1, "everyth": [2, 18], "exact": [10, 18], "exampl": [1, 2, 4, 6, 8, 14, 18], "exchang": 17, "execut": 18, "exist": 14, "expand": 9, "expect": [7, 9, 10], "experi": 1, "explan": [1, 18], "explicit": 1, "exploit": [4, 8], "export": [7, 8, 10, 11, 15, 18], "export_as_straight_box": [8, 18], "export_as_xml": 18, "export_model_to_onnx": 17, "express": [1, 9], "extens": 7, "extern": [1, 16], "extra": [], "extract": [4, 6], "extractor": 8, "f_": 10, "f_a": 10, "factor": 9, "fair": 1, "fairli": 1, "fals": [6, 7, 8, 9, 10, 12, 18], "famili": [], "faq": 1, "fascan": 14, "fast": [4, 6, 8], "fast_bas": [8, 18], "fast_smal": [8, 18], "fast_tini": [8, 18], "faster": [4, 8, 17], "fasterrcnn_mobilenet_v3_large_fpn": 8, "favorit": 18, "featur": [3, 8, 10, 11, 15], "feedback": 1, "feel": [2, 14], "felix92": 14, "few": [17, 18], "figsiz": 10, "figur": [10, 15], "file": [2, 6], "final": 8, "find": [2, 16], "fine": [], "finnish": 6, "first": [2, 6], "firsthand": 6, "fit": [8, 18], "flag": 18, "flip": 9, "float": [7, 9, 10, 17], "float32": [7, 8, 9, 17], "fn": 9, "focu": 14, "focus": [1, 6], "folder": 6, "follow": [1, 2, 3, 6, 9, 10, 12, 13, 14, 15, 18], "font": 6, "font_famili": 6, "font_siz": [], "foral": 10, "forc": 2, "forg": 3, "form": [4, 6, 18], "format": [7, 10, 12, 16, 17, 18], "forpost": [4, 6], "forum": 2, "found": [], "fp16": 17, "frac": 10, "framework": [3, 14, 16, 18], "free": [1, 2, 14], "french": [6, 12, 14, 18], "friendli": 4, "from": [1, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18], "from_hub": [8, 14], "from_imag": [7, 14, 15, 17], "from_pdf": 7, "from_url": 7, "full": [6, 10, 18], "function": [6, 9, 10, 15], "funsd": [4, 6, 16, 18], "further": 16, "futur": 6, "g": [7, 8], "g_": 10, "g_x": 10, "gallagh": [], "gamma": 9, "gaussian": 9, "gaussianblur": 9, "gaussiannois": 9, "gdk": [], "gen": 18, "gender": 1, "gener": [2, 4, 7, 8], "generic_cyrillic_lett": [], "geometri": [4, 7, 18], "geq": 10, "german": [6, 12, 14], "get": [17, 18], "git": 14, "github": [2, 3, 8, 14], "give": [1, 15], "given": [6, 7, 9, 10, 18], "global": 8, "go": 18, "good": 17, "googl": 2, "googlevis": 4, "gpu": [4, 15, 17], "gracefulli": 1, "graph": [4, 6, 7], "grayscal": 9, "ground": 10, "groung": 10, "group": [4, 18], "gt": 10, "gt_box": 10, "gt_label": 10, "gtk": [], "guid": 2, "guidanc": 16, "gvision": 18, "h": [7, 8, 9], "h_": 10, "ha": [2, 6, 10, 16], "handl": [16, 18], "handwrit": 6, 
"handwritten": 16, "harass": 1, "hardwar": 18, "harm": 1, "hat": 10, "have": [1, 2, 10, 12, 14, 16, 17, 18], "head": [8, 18], "healthi": 1, "hebrew": 6, "height": [7, 9], "hello": [10, 18], "help": 17, "here": [5, 9, 11, 15, 16, 18], "hf": 8, "hf_hub_download": 8, "high": 7, "higher": [3, 6, 18], "hindi": 6, "hindi_digit": 6, "hocr": 18, "homebrew": [], "hook": 18, "horizont": [7, 9], "hous": 6, "how": [2, 12, 14, 16], "howev": 16, "hsv": 9, "html": [1, 2, 3, 7, 18], "http": [1, 3, 6, 7, 8, 14, 18], "hub": 8, "hue": 9, "huggingfac": 8, "hw": 6, "i": [1, 2, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17], "i7": 18, "ibrahimov": [], "ic03": [4, 6, 16], "ic13": [4, 6, 16], "icdar": [4, 6], "icdar2019": 6, "id": 18, "ident": 1, "identifi": 4, "iiit": [4, 6], "iiit5k": [6, 16], "iiithw": [4, 6, 16], "imag": [4, 6, 7, 8, 9, 10, 14, 15, 16, 18], "imagenet": 8, "imageri": 1, "images_90k_norm": 6, "img": [6, 9, 16, 17], "img_cont": 7, "img_fold": [6, 16], "img_path": 7, "img_transform": 6, "imgur5k": [4, 6, 16], "imgur5k_annot": 6, "imlist": 6, "impact": 1, "implement": [6, 7, 8, 9, 10, 18], "import": [6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18], "improv": 8, "inappropri": 1, "incid": 1, "includ": [1, 6, 16, 17], "inclus": 1, "increas": 9, "independ": 9, "index": [2, 7], "indic": 10, "individu": 1, "infer": [4, 8, 9, 15], "inform": [1, 2, 4, 6, 16], "input": [2, 7, 8, 9, 17, 18], "input_crop": 8, "input_pag": [8, 10, 18], "input_shap": 17, "input_tensor": 8, "inspir": [1, 9], "instal": [14, 15, 17], "instanc": [1, 18], "instanti": [8, 18], "instead": [6, 7, 8], "insult": 1, "int": [6, 7, 9], "int64": 10, "integ": 10, "integr": [4, 14, 16], "intel": 18, "interact": [1, 7, 10], "interfac": [14, 17], "interoper": 17, "interpol": 9, "interpret": [6, 7], "intersect": 10, "invert": 9, "investig": 1, "invis": 1, "involv": [1, 18], "io": [14, 15, 17], "iou": 10, "iou_thresh": 10, "iou_threshold": 15, "irregular": [4, 8, 16], "isn": 6, "issu": [1, 2, 14], "italian": 6, "iter": [6, 9, 16, 18], "its": [7, 8, 9, 10, 16, 18], "itself": [8, 14], "j": 10, "jame": [], "job": 2, "join": 2, "jpeg": 9, "jpegqual": 9, "jpg": [6, 7, 14, 17], "json": [6, 16, 18], "json_output": 18, "jump": 2, "just": 1, "kei": [4, 6], "kera": [8, 17], "kernel": [4, 8, 9], "kernel_shap": 9, "keywoard": 8, "keyword": [6, 7, 8, 10], "kie": [8, 12], "kie_predictor": [8, 12], "kiepredictor": 8, "kind": 1, "know": [2, 17], "kwarg": [6, 7, 8, 10], "l": 10, "l_j": 10, "label": [6, 10, 15, 16], "label_fil": [6, 16], "label_fold": 6, "label_path": [6, 16], "labels_path": [6, 16], "ladder": 1, "lambda": 9, "lambdatransform": 9, "lang": 18, "languag": [1, 4, 6, 7, 8, 14, 18], "larg": [8, 14], "largest": 10, "last": [3, 6], "latenc": 8, "later": 2, "latest": 18, "latin": 6, "layer": 17, "layout": 18, "lead": 1, "leader": 1, "learn": [1, 4, 8, 17, 18], "least": 3, "left": [10, 18], "legacy_french": 6, "length": [6, 18], "less": [17, 18], "level": [1, 6, 10, 18], "leverag": 11, "lf": 14, "libffi": [], "librari": [2, 3, 11, 12], "light": 4, "lightweight": 17, "like": 1, "limits_": 10, "line": [4, 8, 10, 18], "line_1_1": 18, "link": 12, "linknet": [4, 8], "linknet_resnet18": [8, 12, 17, 18], "linknet_resnet18_rot": [], "linknet_resnet34": [8, 17, 18], "linknet_resnet50": [8, 18], "linux": [], "list": [6, 7, 9, 10, 14], "ll": 10, "load": [4, 6, 8, 15, 17], "load_state_dict": 12, "load_weight": 12, "loc_pr": 18, "local": [2, 4, 6, 8, 10, 16, 18], "localis": 6, "localizationconfus": 10, "locat": [2, 7, 18], "login": 8, "login_to_hub": [8, 14], "logo": [7, 15, 16], "love": 
14, "lower": [9, 10, 18], "m": [2, 10, 18], "m1": 3, "macbook": 3, "machin": 17, "maco": [], "made": 4, "magc_resnet31": 8, "mai": [1, 2], "mail": 1, "main": 11, "maintain": 4, "mainten": 2, "make": [1, 2, 10, 13, 14, 17, 18], "mani": [16, 18], "manipul": 18, "map": [6, 8], "map_loc": 12, "mask_shap": [], "master": [4, 8, 18], "match": [10, 18], "mathcal": 10, "matplotlib": [7, 10], "max": [6, 9, 10], "max_angl": 9, "max_area": 9, "max_char": [6, 16], "max_delta": 9, "max_gain": 9, "max_gamma": 9, "max_qual": 9, "max_ratio": 9, "maximum": [6, 9], "maxval": [8, 9], "mbox": 10, "mean": [9, 10, 12], "meaniou": 10, "meant": [7, 17], "measur": 18, "media": 1, "median": 8, "meet": 12, "member": 1, "memori": [13, 17], "mention": 18, "merg": 6, "messag": 2, "meta": 18, "metadata": 17, "metal": 3, "method": [7, 9, 18], "metric": [10, 18], "middl": 18, "might": [17, 18], "min": 9, "min_area": 9, "min_char": [6, 16], "min_gain": 9, "min_gamma": 9, "min_qual": 9, "min_ratio": 9, "min_val": 9, "minde": [1, 3, 4, 8], "minim": [2, 4], "minimalist": [4, 8], "minimum": [3, 6, 9, 10, 18], "minval": 9, "miss": 3, "mistak": 1, "mixed_float16": 17, "mixed_precis": 17, "mjsynth": [4, 6, 16], "mnt": 6, "mobilenet": [8, 14], "mobilenet_v3_larg": 8, "mobilenet_v3_large_r": 8, "mobilenet_v3_smal": 8, "mobilenet_v3_small_crop_orient": 8, "mobilenet_v3_small_orient": [], "mobilenet_v3_small_page_orient": 8, "mobilenet_v3_small_r": 8, "mobilenetv3": 8, "modal": [4, 6], "mode": 3, "model": [6, 10, 13, 15, 16], "model_nam": [8, 14, 17], "model_path": [15, 17], "moder": 1, "modif": 2, "modifi": [8, 13, 18], "modul": [3, 7, 8, 9, 10, 18], "moment": [], "more": [2, 16, 18], "moscardi": [], "most": 18, "mozilla": 1, "multi": [4, 8], "multilingu": [6, 14], "multipl": [6, 7, 9, 18], "multipli": 9, "multiprocess": 13, "my": 8, "my_awesome_model": 14, "my_hook": 18, "n": [6, 10], "name": [6, 8, 17, 18], "nation": 1, "natur": [1, 4, 6], "nb": [], "ndarrai": [6, 7, 9, 10], "necessari": [3, 12, 13], "need": [2, 3, 6, 10, 12, 13, 14, 15, 18], "neg": 9, "nest": 18, "netraj": [], "network": [4, 6, 8, 17], "neural": [4, 6, 8, 17], "new": [2, 10], "next": [6, 16], "nois": 9, "noisi": [4, 6], "non": [4, 6, 7, 8, 9, 10], "none": [6, 7, 8, 9, 10, 18], "normal": [8, 9], "norwegian": 6, "note": [0, 2, 6, 8, 14, 15, 17], "now": 2, "np": [8, 9, 10, 18], "num_output_channel": 9, "num_sampl": [6, 16], "num_work": [], "number": [6, 9, 10, 18], "numpi": [7, 8, 10, 18], "o": 3, "obb": 15, "obj_detect": 14, "object": [6, 7, 10, 11, 15, 18], "objectness_scor": [7, 18], "oblig": 1, "obtain": 18, "occupi": 17, "ocr": [4, 6, 8, 10, 14, 16], "ocr_carea": 18, "ocr_db_crnn": 10, "ocr_lin": 18, "ocr_pag": 18, "ocr_par": 18, "ocr_predictor": [8, 12, 14, 17, 18], "ocrdataset": [6, 16], "ocrmetr": 10, "ocrpredictor": [8, 12], "ocrx_word": 18, "offens": 1, "offici": [1, 8], "offlin": 1, "offset": 9, "onc": 18, "one": [2, 6, 8, 9, 12, 14, 18], "oneof": 9, "ones": [6, 10], "onli": [2, 8, 9, 10, 14, 16, 17, 18], "onlin": 1, "onnx": 15, "onnxruntim": [15, 17], "onnxtr": 17, "opac": 9, "opacity_rang": 9, "open": [1, 2, 14, 17], "opinion": 1, "optic": [4, 18], "optim": [4, 18], "option": [6, 8, 12], "order": [2, 6, 7, 9], "org": [1, 6, 8, 18], "organ": 7, "orient": [1, 7, 8, 15, 18], "orientationpredictor": 8, "other": [1, 2], "otherwis": [1, 7, 10], "our": [2, 8, 18], "out": [2, 8, 9, 10, 18], "outpout": 18, "output": [7, 9, 17], "output_s": [7, 9], "outsid": 13, "over": [6, 10, 18], "overal": [1, 8], "overlai": 7, "overview": 15, "overwrit": [], "overwritten": 14, 
"own": 4, "p": [9, 18], "packag": [2, 4, 10, 13, 15, 16, 17], "pad": [6, 8, 9, 18], "page": [3, 6, 8, 10, 18], "page1": 7, "page2": 7, "page_1": 18, "page_idx": [7, 18], "page_orientation_predictor": 8, "page_param": [], "pair": 10, "pango": [], "paper": 8, "par_1_1": 18, "paragraph": 18, "paragraph_break": 18, "parallel": [], "param": [9, 18], "paramet": [4, 7, 8, 17], "pars": [4, 6], "parseq": [4, 8, 14, 17, 18], "part": [6, 9, 18], "parti": 3, "partial": 18, "particip": 1, "pass": [6, 7, 8, 18], "password": 7, "patch": [8, 10], "path": [6, 7, 15, 16, 17], "path_to_checkpoint": 12, "path_to_custom_model": 17, "path_to_pt": 12, "patil": [], "pattern": 1, "pdf": [7, 8, 11], "pdfpage": 7, "peopl": 1, "per": [9, 18], "perform": [4, 7, 8, 9, 10, 13, 17, 18], "period": 1, "permiss": 1, "permut": [4, 8], "persian_lett": 6, "person": [1, 16], "phase": 18, "photo": 16, "physic": [1, 7], "pick": 9, "pictur": 7, "pip": [2, 3, 15, 17], "pipelin": 18, "pixbuf": [], "pixel": [7, 9, 18], "pleas": 2, "plot": 10, "plt": 10, "plug": 14, "plugin": 3, "png": 7, "point": 17, "polici": 13, "polish": 6, "polit": 1, "polygon": [6, 10, 18], "pool": 8, "portugues": 6, "posit": [1, 10], "possibl": [2, 10, 14, 18], "post": [1, 18], "postprocessor": 18, "potenti": 8, "power": 4, "ppageno": 18, "pre": [2, 8, 17], "precis": [10, 18], "pred": 10, "pred_box": 10, "pred_label": 10, "predefin": 16, "predict": [7, 8, 10, 18], "predictor": [4, 7, 8, 12, 14, 17], "prefer": 16, "preinstal": 3, "preprocessor": [12, 18], "prerequisit": 14, "present": 11, "preserv": [8, 9, 18], "preserve_aspect_ratio": [7, 8, 9, 12, 18], "pretrain": [4, 8, 10, 12, 17, 18], "pretrained_backbon": [8, 12], "print": 18, "prior": 6, "privaci": 1, "privat": 1, "probabl": 9, "problem": 2, "procedur": 9, "process": [2, 4, 7, 12, 18], "processor": 18, "produc": [11, 18], "product": 17, "profession": 1, "project": [2, 16], "promptli": 1, "proper": 2, "properli": 6, "provid": [1, 2, 4, 14, 15, 16, 18], "public": [1, 4], "publicli": 18, "publish": 1, "pull": 14, "punctuat": 6, "pure": 6, "purpos": 2, "push_to_hf_hub": [8, 14], "py": 14, "pypdfium2": [3, 7], "pyplot": [7, 10], "python": [2, 15], "python3": 14, "pytorch": [3, 4, 8, 9, 12, 14, 17, 18], "q": 2, "qr": [7, 15], "qr_code": 16, "qualiti": 9, "question": 1, "quickli": 4, "quicktour": 11, "r": 18, "race": 1, "ramdisk": 6, "rand": [8, 9, 10, 17, 18], "random": [8, 9, 10, 18], "randomappli": 9, "randombright": 9, "randomcontrast": 9, "randomcrop": 9, "randomgamma": 9, "randomhorizontalflip": 9, "randomhu": 9, "randomjpegqu": 9, "randomli": 9, "randomres": 9, "randomrot": 9, "randomsatur": 9, "randomshadow": 9, "rang": 9, "rassi": 14, "ratio": [8, 9, 18], "raw": [7, 10], "re": 17, "read": [4, 6, 8], "read_html": 7, "read_img": [], "read_img_as_numpi": 7, "read_img_as_tensor": 7, "read_pdf": 7, "readi": 17, "real": [4, 8, 9], "realli": [], "reason": [1, 4, 6], "rebuild": 2, "rebuilt": 2, "recal": [10, 18], "receipt": [4, 6, 18], "reco_arch": [8, 12, 14, 17], "reco_b": 18, "reco_model": [12, 14, 17], "reco_param": 12, "reco_predictor": 12, "recogn": 18, "recognit": [6, 10, 12], "recognition_predictor": [8, 18], "recognition_task": [6, 16], "recognitiondataset": [6, 16], "recognitionpredictor": [8, 12], "rectangular": 8, "red": [], "reduc": [3, 9], "refer": [2, 3, 12, 14, 15, 16, 18], "regardless": 1, "region": 18, "regroup": 10, "regular": 16, "reject": 1, "rel": [7, 9, 10, 18], "relat": 7, "releas": [0, 3], "relev": 15, "religion": 1, "remov": 1, "render": [7, 18], "repo": 8, "repo_id": [8, 14], 
"report": 1, "repositori": [6, 8, 14], "repres": [1, 17, 18], "represent": [4, 8], "request": [1, 14], "requir": [3, 9, 17], "research": 4, "residu": 8, "resiz": [9, 18], "resnet": 8, "resnet18": [8, 14], "resnet31": 8, "resnet34": 8, "resnet50": [8, 14], "resolv": 7, "resolve_block": 18, "resolve_lin": 18, "resourc": 16, "respect": 1, "respons": [], "rest": [2, 9, 10], "restrict": 13, "result": [2, 6, 7, 11, 14, 17, 18], "return": 18, "reusabl": 18, "review": 1, "rgb": [7, 9], "rgb_mode": 7, "rgb_output": 7, "right": [1, 8, 10], "roboflow": [], "robust": [4, 6], "root": 6, "rotat": [6, 7, 8, 9, 10, 16, 18], "run": [2, 3, 8], "same": [2, 7, 10, 16, 17, 18], "sampl": [6, 16, 18], "sample_transform": 6, "sanjin": [], "sar": [4, 8], "sar_resnet31": [8, 18], "satur": 9, "save": [8, 16], "scale": [7, 8, 9, 10], "scale_rang": 9, "scan": [4, 6], "scene": [4, 6, 8], "score": [7, 10], "script": [2, 16], "seamless": 4, "seamlessli": [4, 18], "search": 8, "searchabl": 11, "sec": 18, "second": 18, "section": [12, 14, 15, 17, 18], "secur": [1, 13], "see": [1, 2], "seen": 18, "segment": [4, 8, 18], "self": 18, "semant": [4, 8], "send": 18, "sens": 10, "sensit": 16, "separ": 18, "sequenc": [4, 6, 7, 8, 10, 18], "sequenti": [9, 18], "seri": 1, "seriou": 1, "set": [1, 3, 6, 8, 10, 13, 15, 18], "set_global_polici": 17, "sever": [7, 9, 18], "sex": 1, "sexual": 1, "shade": 9, "shape": [4, 7, 8, 9, 10, 18], "share": [13, 16], "shift": 9, "shm": 13, "should": [2, 6, 7, 9, 10], "show": [4, 7, 8, 10, 12, 14, 15], "showcas": 2, "shuffl": [6, 9], "side": 10, "signatur": 7, "signific": 16, "simpl": [4, 8, 17], "simpler": 8, "sinc": [6, 16], "singl": [1, 2, 4, 6], "single_img_doc": 17, "size": [1, 6, 7, 9, 15, 18], "skew": 18, "slack": 2, "slightli": 8, "small": [2, 8], "smallest": 7, "snapshot_download": 8, "snippet": 18, "so": [2, 3, 6, 8, 14, 16], "social": 1, "socio": 1, "some": [3, 11, 14, 16], "someth": 2, "somewher": 2, "soon": [], "sort": 1, "sourc": [6, 7, 8, 9, 10, 14], "space": [1, 18], "span": 18, "spanish": 6, "spatial": [4, 6, 7], "specif": [2, 3, 10, 12, 16, 18], "specifi": [1, 6, 7], "speed": [4, 8], "sphinx": 2, "sroie": [4, 6, 16], "stabl": 3, "stackoverflow": 2, "stage": 4, "standalon": [], "standard": 9, "start": 6, "state": [4, 10, 15], "static": 10, "statist": [], "statu": 1, "std": [9, 12], "step": 13, "still": 18, "str": [6, 7, 8, 9, 10], "straight": [6, 8, 16, 18], "straighten": [], "straighten_pag": 8, "straigten_pag": [], "stream": 7, "street": [4, 6], "strict": 3, "strictli": 10, "string": [6, 7, 10, 18], "strive": 3, "strong": [4, 8], "structur": [17, 18], "subset": [6, 18], "suggest": [2, 14], "sum": 10, "summari": 10, "support": [3, 15, 17, 18], "sustain": 1, "svhn": [4, 6, 16], "svt": [6, 16], "swedish": 6, "symmetr": [8, 9, 18], "symmetric_pad": [8, 9, 18], "synthes": [], "synthesize_pag": [], "synthet": 4, "synthtext": [4, 6, 16], "system": 18, "t": [2, 6, 12, 17, 18], "tabl": [14, 15], "take": [1, 6, 18], "target": [6, 7, 9, 10, 16], "target_s": 6, "task": [4, 6, 8, 14, 16, 18], "task2": 6, "team": 3, "techminde": 3, "templat": [2, 4], "tensor": [6, 7, 9, 18], "tensorflow": [3, 4, 7, 8, 9, 12, 14, 17, 18], "tensorspec": 17, "term": 1, "test": [6, 16], "test_set": 6, "text": [6, 7, 8, 10, 16], "text_output": 18, "textmatch": 10, "textnet": 8, "textnet_bas": 8, "textnet_smal": 8, "textnet_tini": 8, "textract": [4, 18], "textstylebrush": [4, 6], "textual": [4, 6, 7, 8, 18], "tf": [3, 7, 8, 9, 14, 17], "than": [2, 10, 14], "thank": 2, "thei": [1, 10], "them": [6, 18], "thi": [1, 2, 3, 
5, 6, 9, 10, 12, 13, 14, 16, 17, 18], "thing": [17, 18], "third": 3, "those": [1, 7, 18], "threaten": 1, "threshold": 18, "through": [1, 9, 15, 16], "tilman": 14, "time": [1, 4, 8, 10, 16], "tini": 8, "titl": [7, 18], "tm": 18, "tmp": 13, "togeth": [2, 7], "tograi": 9, "tool": 16, "top": [10, 17, 18], "topic": 2, "torch": [3, 9, 12, 14, 17], "torchvis": 9, "total": 12, "toward": [1, 3], "train": [2, 6, 8, 9, 14, 15, 16, 17, 18], "train_it": [6, 16], "train_load": [6, 16], "train_pytorch": 14, "train_set": [6, 16], "train_tensorflow": 14, "trainabl": [4, 8], "tranform": 9, "transcrib": 18, "transfer": [4, 6], "transfo": 9, "transform": [4, 6, 8], "translat": 1, "troll": 1, "true": [6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18], "truth": 10, "tune": 17, "tupl": [6, 7, 9, 10], "two": [7, 13], "txt": 6, "type": [7, 10, 14, 17, 18], "typic": 18, "u": [1, 2], "ucsd": 6, "udac": 2, "uint8": [7, 8, 10, 18], "ukrainian": [], "unaccept": 1, "underli": [16, 18], "underneath": 7, "understand": [4, 6, 18], "unidecod": [], "uniform": [8, 9], "uniformli": 9, "uninterrupt": [7, 18], "union": 10, "unit": [], "unittest": 2, "unlock": 7, "unoffici": 8, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [8, 18], "updat": 10, "upgrad": 2, "upper": [6, 9], "uppercas": 16, "url": 7, "us": [1, 2, 3, 6, 8, 10, 12, 13, 14, 15, 18], "usabl": 18, "usag": [13, 17], "use_broadcast": [], "use_polygon": [6, 10, 16], "useabl": 18, "user": [4, 7, 11], "utf": 18, "util": 17, "v1": 14, "v3": [8, 14, 18], "valid": 16, "valu": [2, 7, 9, 18], "valuabl": 4, "variabl": 13, "varieti": 6, "veri": 8, "verma": [], "version": [1, 2, 3, 17, 18], "vgg": 8, "vgg16": 14, "vgg16_bn_r": 8, "via": 1, "video": [], "vietnames": 6, "view": [4, 6], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 6, 8], "visiondataset": 6, "visiontransform": 8, "visual": [3, 4, 15], "visualize_pag": 10, "vit_": 8, "vit_b": 8, "vitstr": [4, 8, 17], "vitstr_bas": [8, 18], "vitstr_smal": [8, 12, 17, 18], "viz": 3, "vocab": [12, 14, 16, 17, 18], "vocabulari": [6, 12, 14], "w": [7, 8, 9, 10], "w3": 18, "wa": 1, "wai": [1, 4, 16], "want": [2, 17, 18], "warmup": 18, "wasn": 2, "we": [1, 2, 3, 4, 7, 9, 14, 16, 17, 18], "weasyprint": 7, "web": [2, 7], "websit": 6, "weight": 12, "welcom": 1, "well": [1, 17], "were": [1, 7, 18], "what": 1, "when": [1, 2, 8], "whenev": 2, "where": [2, 7, 9, 10], "whether": [2, 6, 7, 9, 10, 16, 18], "which": [1, 8, 13, 15, 16, 18], "whichev": 3, "while": [9, 18], "why": 1, "width": [7, 9], "wiki": 1, "wildreceipt": [4, 6, 16], "window": [8, 10], "wish": 2, "within": 1, "without": [1, 6, 8], "wonder": 2, "word": [4, 6, 8, 10, 18], "word_1_1": 18, "word_1_2": 18, "word_1_3": 18, "wordgener": [6, 16], "words_onli": 10, "work": [13, 18], "worker": [], "workflow": 2, "worklow": 2, "world": [10, 18], "worth": 8, "wrap": 18, "wrapper": [6, 9], "write": 13, "written": [1, 7], "www": [1, 7, 18], "x": [7, 9, 10], "x_ascend": 18, "x_descend": 18, "x_i": 10, "x_size": 18, "x_wconf": 18, "xhtml": 18, "xmax": 7, "xmin": 7, "xml": 18, "xml_bytes_str": 18, "xml_element": 18, "xml_output": 18, "xmln": 18, "y": 10, "y_i": 10, "y_j": 10, "yet": 15, "ymax": 7, "ymin": 7, "yolov8": 15, "you": [2, 3, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18], "your": [2, 4, 7, 10, 18], "yoursit": 7, "yugesh": [], "zero": [9, 10], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 6, 
"\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 6, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 6, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 6, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 6, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 6, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 6, "\u00e4\u00f6\u00e4\u00f6": 6, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 6, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 6, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 6, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 6, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 6, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 6, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 6, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 6, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 6, "\u067e\u0686\u06a2\u06a4\u06af": 6, "\u0905": 6, "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": 6, "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": 6, "\u0950": 6, "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": 6, "\u09bd": 6, "\u09ce": 6, "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": 6}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.contrib", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Integrate contributions into your pipeline", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": 0, "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "21": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": 0, "9": [], "advanc": 18, "approach": 18, "architectur": 18, "arg": [6, 7, 8, 9, 10], "artefact": 7, "artefactdetect": 15, "attribut": 1, "avail": [15, 16, 18], "aw": 13, "ban": 1, "block": 7, "bug": 2, "changelog": 0, "choos": [16, 18], "classif": [8, 14], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 14, "compos": 9, "conda": 3, "conduct": 1, "connect": 2, "continu": 2, "contrib": 5, "contribut": [2, 5, 15], "contributor": 1, "convent": 14, "correct": 1, "coven": 1, "custom": [6, 12], "data": 16, "dataload": 6, "dataset": [4, 6, 16], "detect": [4, 8, 14, 16, 18], "develop": 2, "do": 18, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 11], "document": [2, 4, 7], "end": 18, "enforc": 1, "evalu": 10, "export": 17, "factori": 8, "featur": [2, 4], "feedback": 2, "file": 7, "from": 14, "gener": [6, 16], "git": 3, "guidelin": 1, "half": 17, "hub": 14, 
"huggingfac": 14, "i": 18, "infer": 17, "instal": [2, 3], "integr": [2, 15], "io": 7, "lambda": 13, "let": 2, "line": 7, "linux": 3, "load": [12, 14, 16], "loader": 6, "main": 4, "mode": 2, "model": [4, 8, 12, 14, 17, 18], "modifi": 2, "modul": [5, 15], "name": 14, "notebook": 11, "object": 16, "ocr": 18, "onli": 3, "onnx": 17, "optim": 17, "option": 18, "orient": [], "our": 1, "output": 18, "own": [12, 16], "packag": 3, "page": 7, "perman": 1, "pipelin": 15, "pledg": 1, "precis": 17, "predictor": 18, "prepar": 17, "prerequisit": 3, "pretrain": 14, "push": 14, "python": 3, "qualiti": 2, "question": 2, "read": 7, "readi": 16, "recognit": [4, 8, 14, 16, 18], "report": 2, "request": 2, "resourc": [], "respons": 1, "return": [6, 7, 8, 10], "right": 18, "scope": 1, "share": 14, "should": 18, "stage": 18, "standard": 1, "structur": [2, 7], "style": 2, "support": [4, 5, 6, 9], "synthet": [6, 16], "task": 10, "temporari": 1, "test": 2, "text": [4, 18], "train": 12, "transform": 9, "two": 18, "unit": 2, "us": [16, 17], "util": 10, "v0": 0, "verif": 2, "via": 3, "visual": 10, "vocab": 6, "warn": 1, "what": 18, "word": 7, "your": [12, 14, 15, 16, 17], "zoo": [4, 8]}}) \ No newline at end of file diff --git a/v0.9.0/transforms.html b/v0.9.0/transforms.html deleted file mode 100644 index 85e94d8a76..0000000000 --- a/v0.9.0/transforms.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - - - - - - - - doctr.transforms - docTR documentation - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
-
-
- -
- -
-
- -
- -
-
- -
-
-
- -
- -
- -
-
-
-

doctr.transforms

-

Data transformations are part of both the training and inference procedures. Drawing inspiration from the design of torchvision, we express transformations as composable modules.
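As a minimal, hypothetical sketch (assuming the TensorFlow backend used in the examples on this page), each transformation is instantiated once as a module and then called on a tensor, and modules can be chained by hand:
>>> import tensorflow as tf
>>> from doctr.transforms import Resize, ToGray
>>> # instantiate each transformation module once, then apply them in sequence
>>> resize = Resize((32, 32))
>>> to_gray = ToGray()
>>> img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
>>> out = to_gray(resize(img))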

-
-

Supported transformations

-

Here are all the transformations that are available through docTR:

-
-
-class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
-

Resizes a tensor to a target size

-
-
Example::
>>> from doctr.transforms import Resize
->>> import tensorflow as tf
->>> transfo = Resize((32, 32))
->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • output_size – expected output size

  • -
  • method – interpolation method

  • -
  • preserve_aspect_ratio – if True, preserve aspect ratio and pad the rest with zeros

  • -
  • symmetric_pad – if True while preserving aspect ratio, the padding will be done symmetrically (see the sketch after this parameter list)

  • -
-
-
-
- -
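As a rough sketch of the two padding-related parameters (an illustration only; the input shape is arbitrary), a non-square image can be resized without distortion by zero-padding it symmetrically:
>>> import tensorflow as tf
>>> from doctr.transforms import Resize
>>> # keep the aspect ratio of a 64x128 image and pad it symmetrically up to 32x32
>>> transfo = Resize((32, 32), preserve_aspect_ratio=True, symmetric_pad=True)
>>> out = transfo(tf.random.uniform(shape=[64, 128, 3], minval=0, maxval=1))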
-
-class doctr.transforms.Normalize(mean: Tuple[float, float, float], std: Tuple[float, float, float])[source]
-

Normalize a tensor channel-wise, using the given per-channel mean and standard deviation

-
-
Example::
>>> from doctr.transforms import Normalize
->>> import tensorflow as tf
->>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-
    -
  • mean – average value per channel

  • -
  • std – standard deviation per channel

  • -
-
-
-
- -
-
-class doctr.transforms.LambdaTransformation(fn: Callable[[Tensor], Tensor])[source]
-

Apply a user-defined function to a tensor

-
-
Example::
>>> from doctr.transforms import LambdaTransformation
->>> import tensorflow as tf
->>> transfo = LambdaTransformation(lambda x: x/ 255.)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

fn – the function to be applied to the input tensor

-
-
-
- -
-
-class doctr.transforms.ToGray[source]
-

Convert an RGB tensor (image or batch of images) to a 3-channel grayscale tensor

-
-
Example::
>>> from doctr.transforms import ToGray
->>> import tensorflow as tf
->>> transfo = ToGray()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
- -
-
-class doctr.transforms.ColorInversion(min_val: float = 0.5)[source]
-

Applies the following transformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-
-
Example::
>>> from doctr.transforms import ColorInversion
->>> import tensorflow as tf
->>> transfo = ColorInversion(min_val=0.6)
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

min_val – range [min_val, 1] to colorize RGB pixels

-
-
-
- -
-
-class doctr.transforms.RandomBrightness(max_delta: float = 0.3)[source]
-

Randomly adjust the brightness of a tensor (image or batch of images) by adding a delta to all pixels

-

Example

-
>>> from doctr.transforms import RandomBrightness
->>> import tensorflow as tf
->>> transfo = RandomBrightness()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • max_delta – the offset added to each pixel is picked at random in [-max_delta, max_delta]

  • -
  • p – probability to apply transformation

  • -
-
-
-
- -
-
-class doctr.transforms.RandomContrast(delta: float = 0.3)[source]
-

Randomly adjust the contrast of a tensor (image or batch of images) by adjusting each pixel: (img - mean) * contrast_factor + mean.

-

Example

-
>>> from doctr.transforms import RandomContrast
->>> import tensorflow as tf
->>> transfo = RandomContrast()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – the multiplicative factor is picked in [1-delta, 1+delta] (contrast is reduced when the factor is below 1)

-
-
-
- -
-
-class doctr.transforms.RandomSaturation(delta: float = 0.5)[source]
-

Randomly adjust the saturation of a tensor (image or batch of images) by converting it to HSV and increasing the saturation by a factor.

-

Example

-
>>> from doctr.transforms import RandomSaturation
->>> import tensorflow as tf
->>> transfo = RandomSaturation()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-

delta – the multiplicative factor is picked in [1-delta, 1+delta] (saturation is reduced when the factor is below 1)

-
-
-
- -
-
-class doctr.transforms.RandomHue(max_delta: float = 0.3)[source]
-

Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-
-
Example::
>>> from doctr.transforms import RandomHue
->>> import tensorflow as tf
->>> transfo = RandomHue()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
-
-
Parameters:
-

max_delta – the offset added to the hue channel is picked at random in [-max_delta, max_delta]

-
-
-
- -
-
-class doctr.transforms.RandomGamma(min_gamma: float = 0.5, max_gamma: float = 1.5, min_gain: float = 0.8, max_gain: float = 1.2)[source]
-

Randomly performs gamma correction on a tensor (image or batch of images)

-

Example

-
>>> from doctr.transforms import RandomGamma
->>> import tensorflow as tf
->>> transfo = RandomGamma()
->>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-
-
-
Parameters:
-
    -
  • min_gamma – non-negative real number, lower bound for gamma param

  • -
  • max_gamma – non-negative real number, upper bound for gamma

  • -
  • min_gain – lower bound for constant multiplier

  • -
  • max_gain – upper bound for constant multiplier

  • -
-
-
-
- -
-
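Gamma correction applies a power law with a constant gain, i.e. out = gain * img ** gamma. A rough NumPy sketch under the default parameter ranges (illustrative only, not the library code):

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> img = rng.random((64, 64, 3), dtype=np.float32)       # dummy image in [0, 1]
>>> gamma = rng.uniform(0.5, 1.5)                         # drawn in [min_gamma, max_gamma]
>>> gain = rng.uniform(0.8, 1.2)                          # drawn in [min_gain, max_gain]
>>> out = np.clip(gain * np.power(img, gamma), 0.0, 1.0)  # power law, clipped to [0, 1]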
class doctr.transforms.RandomJpegQuality(min_quality: int = 60, max_quality: int = 100)[source]

Randomly adjust the JPEG quality of a 3-dimensional RGB image

Example::
>>> from doctr.transforms import RandomJpegQuality
>>> import tensorflow as tf
>>> transfo = RandomJpegQuality()
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

Parameters:
  • min_quality – int in [0, 100], lower bound for the JPEG quality
  • max_quality – int in [0, 100], upper bound for the JPEG quality
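This augmentation amounts to re-encoding the image at a random JPEG quality. A rough sketch of the underlying operation using TensorFlow's built-in re-encoding op (an assumption about the mechanism, not the library's exact code):

>>> import random
>>> import tensorflow as tf
>>> img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
>>> quality = random.randint(60, 100)                 # drawn in [min_quality, max_quality]
>>> out = tf.image.adjust_jpeg_quality(img, quality)  # re-encode, picking up compression artifacts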

Composing transformations

It is common to require several transformations to be performed consecutively.
class doctr.transforms.Compose(transforms: List[Callable[[Any], Any]])[source]

Implements a wrapper that applies transformations sequentially

Example::
>>> from doctr.transforms import Compose, Resize
>>> import tensorflow as tf
>>> transfos = Compose([Resize((32, 32))])
>>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

Parameters:
transforms – list of transformation modules
class doctr.transforms.OneOf(transforms: List[Callable[[Any], Any]])[source]

Randomly apply one of the input transformations

Example::
>>> from doctr.transforms import OneOf, RandomJpegQuality, RandomGamma
>>> import tensorflow as tf
>>> transfo = OneOf([RandomJpegQuality(), RandomGamma()])
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

Parameters:
transforms – list of transformations, only one of which will be picked
class doctr.transforms.RandomApply(transform: Callable[[Any], Any], p: float = 0.5)[source]

Apply the input transformation with probability p

Example::
>>> from doctr.transforms import RandomApply, RandomGamma
>>> import tensorflow as tf
>>> transfo = RandomApply(RandomGamma(), p=.5)
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

Parameters:
  • transform – transformation to apply
  • p – probability to apply
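Taken together, these wrappers compose into a full augmentation pipeline. A hedged end-to-end sketch (the exact combination below is illustrative, built only from the classes documented above):

>>> import tensorflow as tf
>>> from doctr.transforms import (Compose, Resize, ColorInversion, OneOf,
...                               RandomBrightness, RandomContrast, RandomApply)
>>> pipeline = Compose([
...     Resize((32, 32)),                                                      # deterministic resize
...     OneOf([RandomBrightness(max_delta=0.3), RandomContrast(delta=0.3)]),   # pick one photometric change
...     RandomApply(ColorInversion(min_val=0.6), p=0.5),                       # invert colors half of the time
... ])
>>> out = pipeline(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))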
diff --git a/v0.9.0/utils.html b/v0.9.0/utils.html
deleted file mode 100644
index e2f223f06a..0000000000
--- a/v0.9.0/utils.html
+++ /dev/null
doctr.utils

This module groups together non-core features that complement the rest of the package.

Visualization

Easy-to-use functions to make sense of your model’s predictions.
doctr.utils.visualization.visualize_page(page: Dict[str, Any], image: ndarray, words_only: bool = True, display_artefacts: bool = True, scale: float = 10, interactive: bool = True, add_labels: bool = True, **kwargs: Any) → Figure[source]

Visualize a full page with predicted blocks, lines and words

Example::
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> from doctr.utils.visualization import visualize_page
>>> from doctr.models import ocr_db_crnn
>>> model = ocr_db_crnn(pretrained=True)
>>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> out = model([[input_page]])
>>> visualize_page(out[0].pages[0].export(), input_page)
>>> plt.show()

Parameters:
  • page – the exported Page of a Document
  • image – np array of the page, which needs to have the same shape as page[‘dimensions’]
  • words_only – whether only words should be displayed
  • display_artefacts – whether artefacts should be displayed
  • scale – figsize of the largest window side
  • interactive – whether the plot should be interactive
  • add_labels – for static plots, adds text labels on top of the bounding boxes

Task evaluation

Implementations of task-specific metrics to easily assess your model’s performance.
class doctr.utils.metrics.TextMatch[source]

Implements a text match metric (word-level accuracy) for the recognition task.

The raw aggregated metric is computed as follows:

\[\forall X, Y \in \mathcal{W}^N,
TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)\]

with the indicator function \(f_{a}\) defined as:

\[\begin{split}\forall a, x \in \mathcal{W},
f_a(x) = \left\{
    \begin{array}{ll}
        1 & \mbox{if } x = a \\
        0 & \mbox{otherwise.}
    \end{array}
\right.\end{split}\]

where \(\mathcal{W}\) is the set of all possible character sequences and
\(N\) is a strictly positive integer.

Example::
>>> from doctr.utils import TextMatch
>>> metric = TextMatch()
>>> metric.update(['Hello', 'world'], ['hello', 'world'])
>>> metric.summary()

summary() → Dict[str, float][source]

Computes the aggregated metrics

Returns:
a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode
counterpart and its lower-case unidecode counterpart
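A minimal pure-Python sketch of the word-level comparison this metric performs (illustrative only; the actual class also tracks unidecode variants and exposes them through summary()):

>>> preds, targets = ['Hello', 'world'], ['hello', 'world']
>>> raw = sum(p == t for p, t in zip(preds, targets)) / len(targets)          # exact match
>>> caseless = sum(p.lower() == t.lower() for p, t in zip(preds, targets)) / len(targets)
>>> raw, caseless
(0.5, 1.0)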
class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]

Implements common confusion metrics and mean IoU for localization evaluation.

The aggregated metrics are computed as follows:

\[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\
Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\
Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\
meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and
\(y\), and the function \(g_{X}\) defined as:

\[\begin{split}\forall y \in \mathcal{B},
g_X(y) = \left\{
    \begin{array}{ll}
        1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\
        0 & \mbox{otherwise.}
    \end{array}
\right.\end{split}\]

where \(\mathcal{B}\) is the set of possible bounding boxes, and
\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

Example::
>>> import numpy as np
>>> from doctr.utils import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
>>> metric.summary()

Parameters:
iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

summary() → Tuple[float | None, float | None, float | None][source]

Computes the aggregated metrics

Returns:
a tuple with the recall, precision and meanIoU scores
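The building block behind these formulas is pairwise IoU followed by thresholding. A rough NumPy sketch on the example boxes above (illustrative only; the formulas above rely on an assignment between boxes rather than this simple thresholding):

>>> import numpy as np
>>> def iou(a, b):
...     # IoU of two [xmin, ymin, xmax, ymax] boxes
...     w = max(0, min(a[2], b[2]) - max(a[0], b[0]))
...     h = max(0, min(a[3], b[3]) - max(a[1], b[1]))
...     inter = w * h
...     union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
...     return inter / union
>>> gts = np.array([[0, 0, 100, 100]])
>>> preds = np.array([[0, 0, 70, 70], [110, 95, 200, 150]])
>>> ious = np.array([[iou(p, g) for g in gts] for p in preds])
>>> recall = (ious >= 0.5).any(axis=0).mean()      # fraction of ground truths matched
>>> precision = (ious >= 0.5).any(axis=1).mean()   # fraction of predictions matched
>>> mean_iou = ious.max(axis=1).mean()             # mean of the best IoU per prediction
>>> round(float(mean_iou), 3)
0.245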
class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]

Implements an end-to-end OCR metric.

The aggregated metrics are computed as follows:

\[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N,
\forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\
Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\
Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\
meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and
\(y\), and the function \(h_{B, L}\) defined as:

\[\begin{split}\forall (b, l) \in \mathcal{B} \times \mathcal{L},
h_{B,L}(b, l) = \left\{
    \begin{array}{ll}
        1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\
          & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j \\
        0 & \mbox{otherwise.}
    \end{array}
\right.\end{split}\]

where \(\mathcal{B}\) is the set of possible bounding boxes,
\(\mathcal{L}\) is the set of possible character sequences, and
\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

Example::
>>> import numpy as np
>>> from doctr.utils import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
...               ['hello'], ['hello', 'world'])
>>> metric.summary()

Parameters:
iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

summary() → Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

Computes the aggregated metrics

Returns:
a tuple with the recall & precision for each string comparison flexibility and the mean IoU
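The end-to-end criterion simply adds string equality on top of the localization match. A rough sketch of the per-prediction test on the example above (illustrative only, not the library implementation):

>>> def iou(a, b):
...     # IoU of two [xmin, ymin, xmax, ymax] boxes
...     w = max(0, min(a[2], b[2]) - max(a[0], b[0]))
...     h = max(0, min(a[3], b[3]) - max(a[1], b[1]))
...     inter = w * h
...     return inter / ((a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter)
>>> gt_boxes, gt_labels = [[0, 0, 100, 100]], ['hello']
>>> pred_boxes, pred_labels = [[0, 0, 70, 70], [110, 95, 200, 150]], ['hello', 'world']
>>> hits = sum(
...     any(iou(p, g) >= 0.5 and pl == gl for g, gl in zip(gt_boxes, gt_labels))
...     for p, pl in zip(pred_boxes, pred_labels)
... )
>>> hits / len(gt_boxes), hits / len(pred_boxes)   # recall, precision
(0.0, 0.0)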