Added Visual Form Relation Extractor

JohnSnowLabs · May 5, 2024 · 6efe108 · 6efe108
1 parent 69c9452
commit 6efe108
Show file tree

Hide file tree

Showing 20 changed files with 771 additions and 5 deletions.
diff --git a/examples/colab/ocr/ocr_form_relation.ipynb b/examples/colab/ocr/ocr_form_relation.ipynb
diff --git a/examples/colab/ocr/table_extraction.ipynb b/examples/colab/ocr/table_extraction.ipynb
@@ -2752,4 +2752,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
+}
diff --git a/nlu/ocr_components/form_relation_extractor/__init__.py b/nlu/ocr_components/form_relation_extractor/__init__.py
diff --git a/nlu/ocr_components/form_relation_extractor/form_relation_extractor.py b/nlu/ocr_components/form_relation_extractor/form_relation_extractor.py
@@ -0,0 +1,8 @@
+
+class FormRelationExtractor:
+    """NLU-side factory for the Spark OCR FormRelationExtractor transformer."""
+    @staticmethod
+    def get_default_model():
+        from sparkocr.transformers import FormRelationExtractor as OcrFormRelationExtractor
+        extractor = OcrFormRelationExtractor().setInputCol("text_entity")
+        return extractor.setOutputCol("ocr_relations")
diff --git a/nlu/pipe/col_substitution/col_substitution_OCR.py b/nlu/pipe/col_substitution/col_substitution_OCR.py
@@ -96,3 +96,13 @@ def substitute_document_ner_cols(c, cols, nlu_identifier):
         # new_cols[c] = c
     return new_cols
 
+def substitute_form_extractor_text_cols(c, cols, is_unique=True, nlu_identifier=''):
+    """Rename form-relation entity metadata columns to key/value column names."""
+    renames = {
+        'meta_visual_classifier_prediction_entity1': 'form_relation_prediction_key',
+        'meta_visual_classifier_prediction_entity2': 'form_relation_prediction_value',
+    }
+    found = {}
+    for col_name in cols:  # separate name so the `c` argument is not rebound
+        found.update({src: dst for src, dst in renames.items() if src in col_name})
+    return found
diff --git a/nlu/pipe/col_substitution/substitution_map_OCR.py b/nlu/pipe/col_substitution/substitution_map_OCR.py
@@ -20,5 +20,8 @@
     VisualDocumentNerLilt : {
         'default': substitute_document_ner_cols,
     },
+    FormRelationExtractor : {
+        'default': substitute_form_extractor_text_cols,
+    }
 
 }
diff --git a/nlu/pipe/extractors/extractor_configs_OCR.py b/nlu/pipe/extractors/extractor_configs_OCR.py
@@ -28,6 +28,16 @@ def default_visual_classifier_config(output_col_prefix='visual_classifier'):
         description='Gets label and confidence of visual classifier',
     )
 
+def default_form_relation_extractor_config(output_col_prefix='extracted_relations'):
+    """Build the extractor config that returns relation results together with full metadata."""
+    settings = dict(
+        name='full_relation_extraction',
+        description='Get relation extraction result and all metadata, with positions of entities',
+        get_result=True, get_full_meta=True,
+    )
+    return SparkOCRExtractorConfig(output_col_prefix=output_col_prefix, **settings)
+
+
 def default_visual_ner_config(output_col_prefix='visual_ocr'):
     return SparkOCRExtractorConfig(
         get_text=True,

diff --git a/nlu/spellbook.py b/nlu/spellbook.py
@@ -11324,6 +11324,7 @@ class Spellbook:
         'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482',
         'en.image_table_detector':'general_model_table_detection_v2',
         'en.lilt_roberta_funds.v1': 'lilt_roberta_funsd_v1',
+        'visual_form_relation_extractor': OCR_NODE_IDS.FORM_RELATION_EXTRACTOR,
     }
 
     # ocr_model_references = {
@@ -16299,7 +16300,8 @@ class Spellbook:
                              'general_model_table_detection_v2': 'ImageTableDetector',
                              'image_table_cell_detector': 'ImageTableCellDetector',
                              'image_table_cell2text_table': 'ImageCellsToTextTable',
-                            'lilt_roberta_funsd_v1': 'VisualDocumentNer',
+                             'visual_form_relation_extractor':'FormRelationExtractor',
+                             'lilt_roberta_funsd_v1': 'VisualDocumentNer',
                              'instructor_large':'InstructorEmbeddings',
                              'instructor_base':'InstructorEmbeddings',
                              'initial_model': 'MPNetEmbeddings',

diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py
@@ -317,6 +317,7 @@ class AnnoClassRef:
         OCR_NODE_IDS.IMAGE_SPLIT_REGIONS: 'ImageSplitRegions',
         OCR_NODE_IDS.VISUAL_DOCUMENT_NER: 'VisualDocumentNer',
         OCR_NODE_IDS.HOCR_TOKENIZER: 'HocrTokenizer',
+        OCR_NODE_IDS.FORM_RELATION_EXTRACTOR:  'FormRelationExtractor',
     }
 
     @staticmethod

diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py
@@ -149,17 +149,18 @@
 from nlu.ocr_components.table_extractors.image2table.image2table import IMAGE_TABLE_DETECTOR
 from nlu.ocr_components.visual_ner.visual_document_ner.visual_document_ner import VisualDocumentNer
 from nlu.ocr_components.table_extractors.image2table_cell.image2table_cell import ImageTableCellDetector
+from nlu.ocr_components.form_relation_extractor.form_relation_extractor import FormRelationExtractor
 from nlu.ocr_components.table_extractors.image_table_cell2text.image_table_cell2text import ImageTable2Cell2TextTable
 from nlu.ocr_components.utils.image_split_regions.image_split_regions import ImageSplitRegions
 # from nlu.ocr_components.visual_classifiers.visual_doc_classifier.visual_doc_classifier import VisualDocClassifier
 from nlu.pipe.col_substitution.col_substitution_HC import *
 from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols, \
-    substitute_document_classifier_text_cols
+    substitute_document_classifier_text_cols, substitute_form_extractor_text_cols
 from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols,substitute_document_ner_cols
 from nlu.pipe.col_substitution.col_substitution_OS import *
 from nlu.pipe.extractors.extractor_configs_HC import *
 from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, \
-    default_visual_classifier_config
+    default_visual_classifier_config,default_form_relation_extractor_config
 from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, default_visual_ner_config
 from nlu.pipe.extractors.extractor_configs_OS import *
 from nlu.pipe.nlu_component import NluComponent
@@ -4519,4 +4520,25 @@ class ComponentUniverse:
                                          applicable_file_types=['JPG', 'JPEG']
                                          ),
 
+        O_A.FORM_RELATION_EXTRACTOR: partial(NluComponent,
+                                             name=O_A.FORM_RELATION_EXTRACTOR,
+                                             type=T.TEXT_RECOGNIZER,
+                                             get_default_model=FormRelationExtractor.get_default_model,
+                                             # Default extractor config applied to this component's output
+                                             pdf_extractor_methods={'default': default_form_relation_extractor_config},
+                                             # Renames entity1/entity2 meta columns to form_relation_prediction_key/value
+                                             pdf_col_name_substitutor=substitute_form_extractor_text_cols,
+                                             output_level=L.RELATION,
+                                             node=OCR_FEATURE_NODES.nodes[O_A.FORM_RELATION_EXTRACTOR],
+                                             description='Extract key-value relations between entities detected in visual forms',
+                                             provider=ComponentBackends.ocr,
+                                             license=Licenses.ocr,
+                                             computation_context=ComputeContexts.spark,
+                                             output_context=ComputeContexts.spark,
+                                             jsl_anno_class_id=O_A.FORM_RELATION_EXTRACTOR,
+                                             jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
+                                                 O_A.FORM_RELATION_EXTRACTOR],
+                                             applicable_file_types=['DOCX', 'DOC'],  # NOTE(review): the test feeds PNG images - confirm these types
+                                             ),
+
     }
diff --git a/nlu/universe/feature_node_ids.py b/nlu/universe/feature_node_ids.py
@@ -319,6 +319,8 @@ class OCR_NODE_IDS:
     """All available Feature nodes in OCR
     Used to cast the pipeline dependency resolution algorithm into an abstract graph
     """
+
+    FORM_RELATION_EXTRACTOR = JslAnnoId('visual_form_relation_extractor')
     # Visual Document Understanding
     VISUAL_DOCUMENT_CLASSIFIER = JslAnnoId('visual_document_classifier')
     VISUAL_DOCUMENT_NER = JslAnnoId('visual_document_ner')

diff --git a/nlu/universe/feature_node_universes.py b/nlu/universe/feature_node_universes.py
@@ -301,6 +301,9 @@ class OCR_FEATURE_NODES:
     A = OCR_NODE_IDS
     F = OCR_FEATURES
     nodes = {
+        A.FORM_RELATION_EXTRACTOR: OcrFeatureNode(A.FORM_RELATION_EXTRACTOR, [F.TEXT_ENTITY],
+                                                  [F.VISUAL_RELATION]),
+
         A.VISUAL_DOCUMENT_CLASSIFIER: OcrFeatureNode(A.VISUAL_DOCUMENT_CLASSIFIER, [F.HOCR],
                                                      [F.VISUAL_CLASSIFIER_PREDICTION, F.VISUAL_CLASSIFIER_CONFIDENCE, F.FILE_PATH]),
 

diff --git a/nlu/universe/feature_resolutions.py b/nlu/universe/feature_resolutions.py
@@ -116,5 +116,6 @@ class FeatureResolutions:
                                                       ComponentUniverse.components[OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR]),
         OCR_FEATURES.TEXT_DOCUMENT_TOKENIZED: ResolvedFeature(OCR_NODE_IDS.HOCR_TOKENIZER, OCR_NODE_IDS.HOCR_TOKENIZER, 'xx', False,
                                            ComponentUniverse.components[OCR_NODE_IDS.HOCR_TOKENIZER]),
-
+        OCR_FEATURES.TEXT_ENTITY: ResolvedFeature(OCR_NODE_IDS.VISUAL_DOCUMENT_NER, OCR_NODE_IDS.VISUAL_DOCUMENT_NER,
+                                            'xx', False,ComponentUniverse.components[OCR_NODE_IDS.VISUAL_DOCUMENT_NER]),
     }
diff --git a/nlu/universe/feature_universes.py b/nlu/universe/feature_universes.py
@@ -129,7 +129,9 @@ class OCR_FEATURES(JslFeature):
     PREDICTION_CONFIDENCE = JslFeature("prediction_confidence")  # TODO is this just int or some struct?
     VISUAL_CLASSIFIER_CONFIDENCE = JslFeature("visual_classifier_confidence")
     VISUAL_CLASSIFIER_PREDICTION = JslFeature("visual_classifier_prediction")
+    VISUAL_RELATION = JslFeature("visual_classifier_prediction")
 
+    FORM_RELATION = JslFeature('ocr_relations')
 
 class NLP_HC_FEATURES(JslFeature):
     """

diff --git a/tests/nlu_ocr_tests/cv_test.png → tests/datasets/ocr/images/cv_test.png b/tests/nlu_ocr_tests/cv_test.png → tests/datasets/ocr/images/cv_test.png
diff --git a/tests/datasets/ocr/images/form.png b/tests/datasets/ocr/images/form.png
diff --git a/tests/datasets/ocr/images/form2.png b/tests/datasets/ocr/images/form2.png
diff --git a/tests/nlu_ocr_tests/letter.jpg → tests/datasets/ocr/images/letter.jpg b/tests/nlu_ocr_tests/letter.jpg → tests/datasets/ocr/images/letter.jpg
diff --git a/tests/nlu_ocr_tests/ocr_form_relation_extractor.py b/tests/nlu_ocr_tests/ocr_form_relation_extractor.py
@@ -0,0 +1,27 @@
+import os
+import sys
+
+sys.path.append(os.getcwd())
+import unittest
+import nlu
+
+os.environ["PYTHONPATH"] = "F:/Work/repos/nlu"  # NOTE(review): hard-coded absolute local path - confirm it is needed outside the author's machine
+os.environ['PYSPARK_PYTHON'] = sys.executable  # point PYSPARK_PYTHON at the interpreter running this module
+os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable  # same interpreter for PYSPARK_DRIVER_PYTHON
+from johnsnowlabs import nlp, visual
+
+# nlp.install(json_license_path='license.json',visual=True)
+nlp.start(visual=True)  # runs at import time, before any test case executes
+
+class OcrTest(unittest.TestCase):
+
+    def test_classify_document(self):
+        """Run the form relation extractor on the two sample form images and print the result."""
+        # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
+        # The sample images are added under tests/datasets/ocr/images/ by this commit.
+        images = ['tests/datasets/ocr/images/form.png', 'tests/datasets/ocr/images/form2.png']
+        p = nlu.load('visual_form_relation_extractor').predict(images)
+        print(p.to_markdown())  # print once; the old loop re-printed the whole frame per iterated item
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/nlu_ocr_tests/ocr_ner.png b/tests/nlu_ocr_tests/ocr_ner.png