Added Visual Document NER

JohnSnowLabs · Feb 27, 2024 · 32609ec · 32609ec
1 parent 6523f6c
commit 32609ec
Show file tree

Hide file tree

Showing 23 changed files with 1,228 additions and 52 deletions.
diff --git a/examples/colab/ocr/ocr_visual_document_ner.ipynb b/examples/colab/ocr/ocr_visual_document_ner.ipynb
diff --git a/nlu/components/embeddings/sentence_mpnet/MPNetSentenceEmbedding.py b/nlu/components/embeddings/sentence_mpnet/MPNetSentenceEmbedding.py
@@ -1,18 +1,18 @@
-from sparknlp.annotator import MPNetEmbeddings
-
-
-class MPNetSentence:
-    @staticmethod
-    def get_default_model():
-        return MPNetEmbeddings.pretrained() \
-            .setInputCols(["documents"]) \
-            .setOutputCol("mpnet_embeddings")
-
-    @staticmethod
-    def get_pretrained_model(name, language, bucket=None):
-        return MPNetEmbeddings.pretrained(name,language,bucket) \
-            .setInputCols(["documents"]) \
-            .setOutputCol("mpnet_embeddings")
-
-
-
+# from sparknlp.annotator import MPNetEmbeddings
+#
+#
+# class MPNetSentence:
+#     @staticmethod
+#     def get_default_model():
+#         return MPNetEmbeddings.pretrained() \
+#             .setInputCols(["documents"]) \
+#             .setOutputCol("mpnet_embeddings")
+#
+#     @staticmethod
+#     def get_pretrained_model(name, language, bucket=None):
+#         return MPNetEmbeddings.pretrained(name,language,bucket) \
+#             .setInputCols(["documents"]) \
+#             .setOutputCol("mpnet_embeddings")
+#
+#
+#
diff --git a/nlu/ocr_components/utils/hocr_tokenizer/__init__.py b/nlu/ocr_components/utils/hocr_tokenizer/__init__.py
diff --git a/nlu/ocr_components/utils/hocr_tokenizer/hocr_tokenizer.py b/nlu/ocr_components/utils/hocr_tokenizer/hocr_tokenizer.py
@@ -0,0 +1,7 @@
+class HocrTokenizer:
+    """NLU component wrapper that builds the Spark OCR ``HocrTokenizer`` transformer."""
+
+    @staticmethod
+    def get_default_model():
+        """
+        Create the Spark OCR hOCR tokenizer with fixed column names.
+
+        :return: the transformer, reading column ``hocr`` and writing column ``text_tokenized``
+        """
+        # Imported inside the function and under an alias: the Spark OCR class
+        # carries the same name as this wrapper class.
+        from sparkocr.transformers import HocrTokenizer as _SparkOcrHocrTokenizer
+
+        tokenizer = _SparkOcrHocrTokenizer()
+        tokenizer = tokenizer.setInputCol("hocr")
+        return tokenizer.setOutputCol("text_tokenized")
diff --git a/nlu/ocr_components/visual_ner/__init__.py b/nlu/ocr_components/visual_ner/__init__.py
diff --git a/nlu/ocr_components/visual_ner/visual_document_ner/__init__.py b/nlu/ocr_components/visual_ner/visual_document_ner/__init__.py
diff --git a/nlu/ocr_components/visual_ner/visual_document_ner/visual_document_ner.py b/nlu/ocr_components/visual_ner/visual_document_ner/visual_document_ner.py
@@ -0,0 +1,8 @@
+class VisualDocumentNer:
+    """NLU component wrapper that builds the Spark OCR ``VisualDocumentNer`` model."""
+
+    @staticmethod
+    def get_default_model():
+        """
+        Load the ``lilt_roberta_funsd_v1`` model and wire its columns.
+
+        ``pretrained`` is called with ``("lilt_roberta_funsd_v1", "en", "clinical/ocr")``
+        (presumably model name, language and bucket - confirm against the Spark OCR API).
+
+        :return: the model, reading columns ``text_tokenized`` and ``image`` and
+                 writing column ``text_entity``
+        """
+        # Imported inside the function and under an alias: the Spark OCR class
+        # carries the same name as this wrapper class.
+        from sparkocr.transformers import VisualDocumentNer as _SparkOcrVisualDocumentNer
+
+        model = _SparkOcrVisualDocumentNer().pretrained(
+            "lilt_roberta_funsd_v1", "en", "clinical/ocr"
+        )
+        model = model.setInputCols(["text_tokenized", "image"])
+        return model.setOutputCol("text_entity")
diff --git a/nlu/pipe/col_substitution/col_name_substitution_utils.py b/nlu/pipe/col_substitution/col_name_substitution_utils.py
@@ -13,7 +13,7 @@
 from sparknlp.annotator import *
 
 import nlu
-from nlu.pipe.col_substitution import substitution_map_OS
+from nlu.pipe.col_substitution import substitution_map_OS, substitution_map_OCR
 from nlu.pipe.extractors.extractor_base_data_classes import SparkOCRExtractorConfig
 from nlu.universe.feature_universes import NLP_FEATURES
 from nlu.universe.logic_universes import AnnoTypes
@@ -73,7 +73,13 @@ def substitute_col_names(df, anno_2_ex, pipe, stranger_cols=[], get_embeddings=F
                 anno2final_cols[c.model] = list(old2new_anno_cols.values())
                 new_cols.update(old2new_anno_cols)
                 new_cols = {**new_cols, **(old2new_anno_cols)}
-                continue
+                if type(c.model) in substitution_map_OCR.OCR_anno2substitution_fn.keys():
+                    cols = df.columns.tolist()
+                    substitution_fn = substitution_map_OCR.OCR_anno2substitution_fn[type(c.model)]['default']
+                    old2new_anno_cols = substitution_fn(c, cols, deducted_component_names[c])
+                    anno2final_cols[c.model] = list(old2new_anno_cols.values())
+                    new_cols = {**new_cols, **(old2new_anno_cols)}
+                    continue
             if 'embedding' in c.type and get_embeddings == False: continue
             cols_to_substitute = ColSubstitutionUtils.get_final_output_cols_of_component(c, df, anno_2_ex)
             if len(cols_to_substitute) == 0:
@@ -126,7 +132,26 @@ def get_final_output_cols_of_component(c, df, anno_2_ex) -> List[str]:
         result_cols = []
         if isinstance(configs, SparkOCRExtractorConfig):
             # TODO better OCR-EX handling --> Col Name generator function which we use everywhere for unified col naming !!!!!
-            return ['text']
+            # return ['text']
+            for col in df.columns:
+                if 'meta_' + configs.output_col_prefix in col:
+                    base_meta_prefix = 'meta_' + configs.output_col_prefix
+                    meta_col_name = base_meta_prefix + col.split(base_meta_prefix)[-1]
+                    if meta_col_name in df.columns:
+                        # special case for overlapping names with _
+                        if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and not \
+                                c.spark_output_column_names[0].split('_')[-1].isnumeric(): continue
+                        if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and \
+                                c.spark_output_column_names[0].split('_')[-1].isnumeric():
+                            id1 = int(col.split(base_meta_prefix)[-1].split('_')[1])
+                            id2 = int(c.spark_output_column_names.split('_')[-1])
+                            if id1 != id2: continue
+                        result_cols.append(meta_col_name)
+                    elif c.type == AnnoTypes.CHUNK_CLASSIFIER:
+                        result_cols.append(col)
+                    else:
+                        logger.info(f"Could not find meta col for os_components={c}, col={col}. Ommiting col..")
+            return result_cols
         if isinstance(c.model, MultiDocumentAssembler):
             return [f'{NLP_FEATURES.DOCUMENT_QUESTION}_results', f'{NLP_FEATURES.DOCUMENT_QUESTION_CONTEXT}_results']
 

diff --git a/nlu/pipe/col_substitution/col_substitution_OCR.py b/nlu/pipe/col_substitution/col_substitution_OCR.py
@@ -31,3 +31,46 @@ def substitute_recognized_text_cols(c, cols, is_unique=True, nlu_identifier=''):
     #     else : logger.info(f'Dropping unmatched metadata_col={col} for c={c}')
     #     # new_cols[col]= f"{new_base_name}_confidence"
     # return new_cols
+def substitute_document_classifier_text_cols(c, cols, is_unique=True, nlu_identifier=''):
+    """
+    Build the old-name -> new-name column mapping for the visual document classifier.
+
+    Every name in ``cols`` is mapped to itself, except for two renames that are
+    applied when the column is present:
+    - ``visual_classifier_label.1`` -> ``file_path``
+    - ``visual_classifier_label``   -> ``visual_classifier_prediction``
+
+    :param c: the component the columns belong to; not used by this function
+    :param cols: collection of column names (supports ``in`` and iteration)
+    :param is_unique: not used by this function
+    :param nlu_identifier: not used by this function
+    :return: dict mapping each column name to its final name; renamed columns
+             come first, the remaining columns follow in the order of ``cols``.
+             Empty ``cols`` gives an empty dict.
+    """
+    renames = (
+        ('visual_classifier_label.1', 'file_path'),
+        ('visual_classifier_label', 'visual_classifier_prediction'),
+    )
+    # Register the renames once, up front, instead of re-checking them for every column.
+    new_cols = {old: new for old, new in renames if old in cols}
+    for col in cols:
+        # setdefault (not plain assignment): an identity entry must never
+        # overwrite a rename. With plain assignment the rename was lost
+        # whenever the renamed column happened to be the last one in cols.
+        new_cols.setdefault(col, col)
+    return new_cols
+
+def substitute_document_ner_cols(c, cols, nlu_identifier):
+    """
+    Build the rename mapping for the metadata columns of the visual document NER output.
+
+    Each rule fires when its marker substring occurs in at least one name in ``cols``.
+    The key written to the mapping is a fixed column name, not the name that matched:
+    - ``_ocr_confidence`` -> ``meta_text_entity_confidence`` becomes ``<base>_confidence``
+    - ``_token``          -> ``meta_text_entity_token``      becomes ``<base>_ner_entity``
+    - ``_entity_x``       -> ``meta_text_entity_x``          becomes ``<base>_x_location``
+    - ``_entity_y``       -> ``meta_text_entity_y``          becomes ``<base>_y_location``
+
+    ``<base>`` is ``entities`` when ``nlu_identifier`` equals ``'UNIQUE'``, otherwise
+    ``entities_<nlu_identifier>``. Columns matching no rule are left out of the mapping.
+
+    :param c: the component the columns belong to; not used by this function
+    :param cols: iterable of column names
+    :param nlu_identifier: identifier appended to the base name unless it is ``'UNIQUE'``
+    :return: dict from fixed metadata column name to its new name
+    """
+    if nlu_identifier == 'UNIQUE':
+        new_base_name = 'entities'
+    else:
+        new_base_name = f'entities_{nlu_identifier}'
+
+    # (marker substring, fixed source column, new column name)
+    rules = (
+        ('_ocr_confidence', 'meta_text_entity_confidence', f'{new_base_name}_confidence'),
+        ('_token', 'meta_text_entity_token', f'{new_base_name}_ner_entity'),
+        ('_entity_x', 'meta_text_entity_x', f'{new_base_name}_x_location'),
+        ('_entity_y', 'meta_text_entity_y', f'{new_base_name}_y_location'),
+    )
+
+    new_cols = {}
+    # Columns stay in the outer loop so keys are inserted in first-match order.
+    for col_name in cols:
+        for marker, source_col, target_col in rules:
+            if marker in col_name:
+                new_cols[source_col] = target_col
+    return new_cols
+
diff --git a/nlu/pipe/col_substitution/substitution_map_OCR.py b/nlu/pipe/col_substitution/substitution_map_OCR.py
@@ -0,0 +1,24 @@
+"""
+Map Spark OCR annotator classes in the pipeline to their column-substitution functions.
+
+`OCR_anno2substitution_fn` is keyed by annotator class; each value maps a config name to the
+substitution function for that class. Only a 'default' entry is registered per class here.
+
+NOTE(review): the text below reads as if carried over from the extractor-config maps and
+describes config flavours that are not defined in this module - confirm it still applies:
+Every Annotator should have 2 configs. Some might offer multiple config/method pairs, based on model_anno_obj/NLP reference.
+- default/minimalistic -> Just the results of the annotations, no confidences or extra metadata
+- with meta            -> A config that leverages white/black list and gets the most relevant metadata
+- with positions       -> With Begins/Ends
+- with sentence references -> Return the sentence/chunk no. reference from the metadata.
+                                If a document has multi-sentences, this will map a label back to a corresponding sentence
+"""
+# from nlu.pipe.col_substitution.col_substitution_HC import *
+from nlu.pipe.col_substitution.col_substitution_OS import *
+from nlu.pipe.col_substitution.col_substitution_OCR import *
+
+# NOTE(review): this star import runs at module import time, so importing this module
+# requires sparkocr to be installed - confirm every importer is guarded accordingly.
+from sparkocr.transformers import *
+
+# Annotator class -> {config name -> substitution function}.
+# The annotator class names are not imported explicitly; presumably they come from the
+# sparkocr star import above.
+OCR_anno2substitution_fn = {
+     VisualDocumentClassifier : {
+        'default': substitute_document_classifier_text_cols ,
+    },
+    # NOTE(review): keyed on VisualDocumentNerLilt, not VisualDocumentNer - verify this is the
+    # exact class of the loaded model if lookups are done by type().
+    VisualDocumentNerLilt : {
+        'default': substitute_document_ner_cols,
+    },
+
+}
diff --git a/nlu/pipe/col_substitution/substitution_map_OS.py b/nlu/pipe/col_substitution/substitution_map_OS.py
@@ -73,9 +73,9 @@
     SentenceEmbeddings: {
         'default': substitute_sent_embed_cols,
     },
-    MPNetEmbeddings: {
-        'default': substitute_sent_embed_cols,
-    },
+    # MPNetEmbeddings: {
+    #     'default': substitute_sent_embed_cols,
+    # },
     Tokenizer: {
         'default': substitute_tokenizer_cols,
     },

diff --git a/nlu/pipe/extractors/extractor_base_data_classes.py b/nlu/pipe/extractors/extractor_base_data_classes.py
@@ -142,7 +142,7 @@ class SparkOCRExtractorConfig(SparkNLPExtractorConfig):
     get_image_resolution: bool = field(default=False)
     get_image_data: bool = field(default=False)
     # General OCR fields
-    # get_path          :bool              = field(default = False)# origin is path
+    get_path: bool = field(default=False)# origin is path
     get_modification_time: bool = field(default=False)
     get_length: bool = field(default=False)
     get_page_num: bool = field(default=False)

diff --git a/nlu/pipe/extractors/extractor_configs_OCR.py b/nlu/pipe/extractors/extractor_configs_OCR.py
@@ -28,6 +28,22 @@ def default_visual_classifier_config(output_col_prefix='visual_classifier'):
         description='Gets label and confidence of visual classifier',
     )
 
+def default_visual_ner_config(output_col_prefix='visual_ocr'):
+    """
+    Build the default extractor config for the visual NER annotator.
+
+    Sets the text, begin, end, result, meta, full-meta, image-data and path flags to True,
+    the annotator-type flag to False, and whitelists the metadata keys
+    ``entity``, ``confidence``, ``sentence`` and ``chunk``.
+
+    :param output_col_prefix: prefix passed through to the config unchanged
+    :return: a ``SparkOCRExtractorConfig`` carrying the settings above
+    """
+    settings = {
+        'get_text': True,
+        'get_begin': True,
+        'get_end': True,
+        'get_result': True,
+        'get_meta': True,
+        'get_full_meta': True,
+        'get_image_data': True,
+        'get_path': True,
+        'get_annotator_type': False,
+        'output_col_prefix': output_col_prefix,
+        'meta_white_list': ['entity', 'confidence', 'sentence', 'chunk'],
+        'name': 'visual_ner label, confidence and entities ',
+        'description': 'Gets label, entities and confidence of visual ner',
+    }
+    return SparkOCRExtractorConfig(**settings)
 
 def default_binary_to_image_config(output_col_prefix='binary_image'):
     return SparkOCRExtractorConfig(

diff --git a/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py b/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
@@ -80,11 +80,15 @@ def extract_base_sparkocr_features(row: pd.Series, configs: SparkOCRExtractorCon
         else:
             return {'visual_classifier_confidence': row}
 
+    # if 'FULL binary to image extractor ' in configs.name:
+    #     if not isinstance(row, str):
+    #         return {'path': row}
+
+
     else:
         # # OCR unpackers (TODO WIP)
         # unpack_text = lambda x: unpack_dict_list(x, 'text')
-        # # unpack_image = lambda x : unpack_dict_list(x, 'TODO') # is data?
-        # unpack_image_origin = lambda x: unpack_dict_list(x, 'origin')
+        # # unpack_image = lambda x : unpack_dict_list(x, 'TODO') # is data?       # unpack_image_origin = lambda x: unpack_dict_list(x, 'origin')
         # unpack_image_height = lambda x: unpack_dict_list(x, 'height')
         # unpack_image_width = lambda x: unpack_dict_list(x, 'width')
         # unpack_image_n_channels = lambda x: unpack_dict_list(x, 'nChannels')
@@ -317,6 +321,8 @@ def apply_extractors_and_merge(df, anno_2_ex_config, keep_stranger_features, str
     extractor = lambda c: df[c].apply(extract_master, configs=anno_2_ex_config[c])
     keep_strangers = lambda c: df[c]
 
+    stranger_features.append('path') if 'path' in df.columns and 'text_entity' in anno_2_ex_config.keys() else None
+
     # merged_extraction_df
     # apply the extract_master together with it's configs to every column and geenrate a list of output DF's, one per Spark NLP COL
     # TODO handle MULTI-COL-OUTPUT. If Anno has multi cols, then we either needs multiple keys in anno_2_ex or use something besides

diff --git a/nlu/spellbook.py b/nlu/spellbook.py
@@ -11303,7 +11303,8 @@ class Spellbook:
         'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
         'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
         'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482',
-        'en.image_table_detector':'general_model_table_detection_v2'
+        'en.image_table_detector':'general_model_table_detection_v2',
+        'en.lilt_roberta_funds.v1': 'lilt_roberta_funsd_v1',
     }
 
     # ocr_model_references = {
@@ -16273,6 +16274,7 @@ class Spellbook:
                              'general_model_table_detection_v2': 'ImageTableDetector',
                              'image_table_cell_detector': 'ImageTableCellDetector',
                              'image_table_cell2text_table': 'ImageCellsToTextTable',
+                            'lilt_roberta_funsd_v1': 'VisualDocumentNer',
                              'instructor_large':'InstructorEmbeddings',
                              'instructor_base':'InstructorEmbeddings',
                              'initial_model': 'MPNetEmbeddings',

diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py
@@ -305,6 +305,8 @@ class AnnoClassRef:
         OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR: 'ImageTableCellDetector',
         OCR_NODE_IDS.IMAGE_TABLE_CELL2TEXT_TABLE: 'ImageCellsToTextTable',
         OCR_NODE_IDS.IMAGE_SPLIT_REGIONS: 'ImageSplitRegions',
+        OCR_NODE_IDS.VISUAL_DOCUMENT_NER: 'VisualDocumentNer',
+        OCR_NODE_IDS.HOCR_TOKENIZER: 'HocrTokenizer',
     }
 
     @staticmethod