Skip to content

Commit

Permalink
Added Visual Document NER
Browse files Browse the repository at this point in the history
  • Loading branch information
gadde5300 committed Feb 27, 2024
1 parent 6523f6c commit 32609ec
Show file tree
Hide file tree
Showing 23 changed files with 1,228 additions and 52 deletions.
968 changes: 968 additions & 0 deletions examples/colab/ocr/ocr_visual_document_ner.ipynb

Large diffs are not rendered by default.

36 changes: 18 additions & 18 deletions nlu/components/embeddings/sentence_mpnet/MPNetSentenceEmbedding.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from sparknlp.annotator import MPNetEmbeddings


class MPNetSentence:
    """Builds MPNetEmbeddings annotators wired to NLU's column names.

    Both builders configure the annotator to read the ``documents`` column
    and to write its output to the ``mpnet_embeddings`` column.
    """

    @staticmethod
    def get_default_model():
        """Return the default pretrained MPNetEmbeddings with columns configured."""
        embedder = MPNetEmbeddings.pretrained()
        embedder = embedder.setInputCols(["documents"])
        return embedder.setOutputCol("mpnet_embeddings")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Return the pretrained MPNetEmbeddings identified by the given reference.

        :param name: model name forwarded to ``MPNetEmbeddings.pretrained``
        :param language: language code forwarded to ``MPNetEmbeddings.pretrained``
        :param bucket: optional third argument forwarded to ``pretrained``
        :return: the annotator with input/output columns configured
        """
        embedder = MPNetEmbeddings.pretrained(name, language, bucket)
        embedder = embedder.setInputCols(["documents"])
        return embedder.setOutputCol("mpnet_embeddings")



# from sparknlp.annotator import MPNetEmbeddings
#
#
# class MPNetSentence:
# @staticmethod
# def get_default_model():
# return MPNetEmbeddings.pretrained() \
# .setInputCols(["documents"]) \
# .setOutputCol("mpnet_embeddings")
#
# @staticmethod
# def get_pretrained_model(name, language, bucket=None):
# return MPNetEmbeddings.pretrained(name,language,bucket) \
# .setInputCols(["documents"]) \
# .setOutputCol("mpnet_embeddings")
#
#
#
Empty file.
7 changes: 7 additions & 0 deletions nlu/ocr_components/utils/hocr_tokenizer/hocr_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
class HocrTokenizer:
    """NLU-side builder for the Spark OCR ``HocrTokenizer`` transformer."""

    @staticmethod
    def get_default_model():
        """Return a Spark OCR HocrTokenizer reading ``hocr`` and writing ``text_tokenized``."""
        # Function-scope import, aliased so the Spark OCR class is not confused
        # with this wrapper class of the same name.
        from sparkocr.transformers import HocrTokenizer as SparkOcrHocrTokenizer

        tokenizer = SparkOcrHocrTokenizer()
        tokenizer = tokenizer.setInputCol("hocr")
        return tokenizer.setOutputCol("text_tokenized")
Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
class VisualDocumentNer:
    """NLU-side builder for the Spark OCR ``VisualDocumentNer`` transformer."""

    @staticmethod
    def get_default_model():
        """Return the pretrained ``lilt_roberta_funsd_v1`` visual NER model.

        The model is configured to read the ``text_tokenized`` and ``image``
        columns and to write its output to ``text_entity``.
        """
        # Function-scope import, aliased so the Spark OCR class is not confused
        # with this wrapper class of the same name.
        from sparkocr.transformers import VisualDocumentNer as SparkOcrVisualDocumentNer

        ner_model = SparkOcrVisualDocumentNer().pretrained(
            "lilt_roberta_funsd_v1", "en", "clinical/ocr"
        )
        ner_model = ner_model.setInputCols(["text_tokenized", "image"])
        return ner_model.setOutputCol("text_entity")
31 changes: 28 additions & 3 deletions nlu/pipe/col_substitution/col_name_substitution_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sparknlp.annotator import *

import nlu
from nlu.pipe.col_substitution import substitution_map_OS
from nlu.pipe.col_substitution import substitution_map_OS, substitution_map_OCR
from nlu.pipe.extractors.extractor_base_data_classes import SparkOCRExtractorConfig
from nlu.universe.feature_universes import NLP_FEATURES
from nlu.universe.logic_universes import AnnoTypes
Expand Down Expand Up @@ -73,7 +73,13 @@ def substitute_col_names(df, anno_2_ex, pipe, stranger_cols=[], get_embeddings=F
anno2final_cols[c.model] = list(old2new_anno_cols.values())
new_cols.update(old2new_anno_cols)
new_cols = {**new_cols, **(old2new_anno_cols)}
continue
if type(c.model) in substitution_map_OCR.OCR_anno2substitution_fn.keys():
cols = df.columns.tolist()
substitution_fn = substitution_map_OCR.OCR_anno2substitution_fn[type(c.model)]['default']
old2new_anno_cols = substitution_fn(c, cols, deducted_component_names[c])
anno2final_cols[c.model] = list(old2new_anno_cols.values())
new_cols = {**new_cols, **(old2new_anno_cols)}
continue
if 'embedding' in c.type and get_embeddings == False: continue
cols_to_substitute = ColSubstitutionUtils.get_final_output_cols_of_component(c, df, anno_2_ex)
if len(cols_to_substitute) == 0:
Expand Down Expand Up @@ -126,7 +132,26 @@ def get_final_output_cols_of_component(c, df, anno_2_ex) -> List[str]:
result_cols = []
if isinstance(configs, SparkOCRExtractorConfig):
# TODO better OCR-EX handling --> Col Name generator function which we use everywhere for unified col naming !!!!!
return ['text']
# return ['text']
for col in df.columns:
if 'meta_' + configs.output_col_prefix in col:
base_meta_prefix = 'meta_' + configs.output_col_prefix
meta_col_name = base_meta_prefix + col.split(base_meta_prefix)[-1]
if meta_col_name in df.columns:
# special case for overlapping names with _
if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and not \
c.spark_output_column_names[0].split('_')[-1].isnumeric(): continue
if col.split(base_meta_prefix)[-1].split('_')[1].isnumeric() and \
c.spark_output_column_names[0].split('_')[-1].isnumeric():
id1 = int(col.split(base_meta_prefix)[-1].split('_')[1])
id2 = int(c.spark_output_column_names.split('_')[-1])
if id1 != id2: continue
result_cols.append(meta_col_name)
elif c.type == AnnoTypes.CHUNK_CLASSIFIER:
result_cols.append(col)
else:
logger.info(f"Could not find meta col for os_components={c}, col={col}. Ommiting col..")
return result_cols
if isinstance(c.model, MultiDocumentAssembler):
return [f'{NLP_FEATURES.DOCUMENT_QUESTION}_results', f'{NLP_FEATURES.DOCUMENT_QUESTION_CONTEXT}_results']

Expand Down
43 changes: 43 additions & 0 deletions nlu/pipe/col_substitution/col_substitution_OCR.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,46 @@ def substitute_recognized_text_cols(c, cols, is_unique=True, nlu_identifier=''):
# else : logger.info(f'Dropping unmatched metadata_col={col} for c={c}')
# # new_cols[col]= f"{new_base_name}_confidence"
# return new_cols
def substitute_document_classifier_text_cols(c, cols, is_unique=True, nlu_identifier=''):
    """
    Build the old-name -> new-name mapping for visual document classifier columns.

    ``visual_classifier_label.1`` is renamed to ``file_path`` and
    ``visual_classifier_label`` is renamed to ``visual_classifier_prediction``
    whenever they are present in ``cols``. Every other column maps to itself.

    :param c: pipeline component; not used by this function
    :param cols: names of the columns to build the mapping for
    :param is_unique: not used by this function
    :param nlu_identifier: not used by this function
    :return: dict mapping each column name in ``cols`` to its final name
    """
    renames = {
        'visual_classifier_label.1': 'file_path',
        'visual_classifier_label': 'visual_classifier_prediction',
    }
    # Register the renames first so they cannot be overwritten by the identity
    # mapping below, regardless of the position of these columns in `cols`.
    new_cols = {old: new for old, new in renames.items() if old in cols}
    for col in cols:
        # Identity mapping for every column that has no dedicated rename.
        new_cols.setdefault(col, col)
    return new_cols  # TODO

def substitute_document_ner_cols(c, cols, nlu_identifier):
    """
    Build the old-name -> new-name mapping for visual document NER metadata columns.

    The new names are prefixed with ``entities`` when ``nlu_identifier`` is
    ``'UNIQUE'`` and with ``entities_<nlu_identifier>`` otherwise. A fixed
    ``meta_text_entity_*`` column is added to the mapping as soon as any name
    in ``cols`` contains the matching marker substring.

    :param c: pipeline component; not used by this function
    :param cols: column names to scan for the marker substrings
    :param nlu_identifier: identifier used to build the new column prefix
    :return: dict mapping ``meta_text_entity_*`` column names to their new names
    """
    if nlu_identifier == 'UNIQUE':
        prefix = 'entities'
    else:
        prefix = f'entities_{nlu_identifier}'

    # (marker substring, fixed source column, suffix of the new column name)
    rules = (
        ('_ocr_confidence', 'meta_text_entity_confidence', 'confidence'),
        ('_token', 'meta_text_entity_token', 'ner_entity'),
        ('_entity_x', 'meta_text_entity_x', 'x_location'),
        ('_entity_y', 'meta_text_entity_y', 'y_location'),
    )

    new_cols = {}
    # Columns drive the outer loop so keys are inserted in the order the
    # markers are first encountered across `cols`.
    for col_name in cols:
        for marker, source_col, suffix in rules:
            if marker in col_name:
                new_cols[source_col] = f'{prefix}_{suffix}'
    return new_cols

24 changes: 24 additions & 0 deletions nlu/pipe/col_substitution/substitution_map_OCR.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Resolve Annotator Classes in the Pipeline to Extractor Configs and Methods
Every Annotator should have 2 configs. Some might offer multiple config/method pairs, based on model_anno_obj/NLP reference.
- default/minimalistic -> Just the results of the annotations, no confidences or extra metadata
- with meta -> A config that leverages white/black list and gets the most relevant metadata
- with positions -> With Begins/Ends
- with sentence references -> Return the sentence/chunk no. reference from the metadata.
If a document has multiple sentences, this will map a label back to a corresponding sentence
"""
# from nlu.pipe.col_substitution.col_substitution_HC import *
from nlu.pipe.col_substitution.col_substitution_OS import *
from nlu.pipe.col_substitution.col_substitution_OCR import *

from sparkocr.transformers import *

# Lookup table from a Spark OCR annotator class to its column-substitution
# functions. Each value is a dict whose 'default' entry is the function that
# builds the old-name -> new-name column mapping for that annotator.
OCR_anno2substitution_fn = {
# Visual document classifier output columns.
VisualDocumentClassifier : {
'default': substitute_document_classifier_text_cols ,
},
# Visual document NER (LiLT) metadata columns.
VisualDocumentNerLilt : {
'default': substitute_document_ner_cols,
},

}
6 changes: 3 additions & 3 deletions nlu/pipe/col_substitution/substitution_map_OS.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@
SentenceEmbeddings: {
'default': substitute_sent_embed_cols,
},
MPNetEmbeddings: {
'default': substitute_sent_embed_cols,
},
# MPNetEmbeddings: {
# 'default': substitute_sent_embed_cols,
# },
Tokenizer: {
'default': substitute_tokenizer_cols,
},
Expand Down
2 changes: 1 addition & 1 deletion nlu/pipe/extractors/extractor_base_data_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ class SparkOCRExtractorConfig(SparkNLPExtractorConfig):
get_image_resolution: bool = field(default=False)
get_image_data: bool = field(default=False)
# General OCR fields
# get_path :bool = field(default = False)# origin is path
get_path: bool = field(default=False)# origin is path
get_modification_time: bool = field(default=False)
get_length: bool = field(default=False)
get_page_num: bool = field(default=False)
Expand Down
16 changes: 16 additions & 0 deletions nlu/pipe/extractors/extractor_configs_OCR.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,22 @@ def default_visual_classifier_config(output_col_prefix='visual_classifier'):
description='Gets label and confidence of visual classifier',
)

def default_visual_ner_config(output_col_prefix='visual_ocr'):
    """
    Build the default extractor configuration for visual NER output.

    :param output_col_prefix: prefix stored on the config as ``output_col_prefix``
    :return: a ``SparkOCRExtractorConfig`` with text, begin/end, result, full
        metadata, image data and path extraction enabled, annotator type
        disabled, and a metadata white list of entity, confidence, sentence
        and chunk
    """
    settings = dict(
        get_text=True,
        get_begin=True,
        get_end=True,
        get_result=True,
        get_meta=True,
        get_full_meta=True,
        get_image_data=True,
        get_path=True,
        get_annotator_type=False,
        output_col_prefix=output_col_prefix,
        meta_white_list=['entity', 'confidence', 'sentence', 'chunk'],
        name='visual_ner label, confidence and entities ',
        description='Gets label, entities and confidence of visual ner',
    )
    return SparkOCRExtractorConfig(**settings)

def default_binary_to_image_config(output_col_prefix='binary_image'):
return SparkOCRExtractorConfig(
Expand Down
10 changes: 8 additions & 2 deletions nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,15 @@ def extract_base_sparkocr_features(row: pd.Series, configs: SparkOCRExtractorCon
else:
return {'visual_classifier_confidence': row}

# if 'FULL binary to image extractor ' in configs.name:
# if not isinstance(row, str):
# return {'path': row}


else:
# # OCR unpackers (TODO WIP)
# unpack_text = lambda x: unpack_dict_list(x, 'text')
# # unpack_image = lambda x : unpack_dict_list(x, 'TODO') # is data?
# unpack_image_origin = lambda x: unpack_dict_list(x, 'origin')
# # unpack_image = lambda x : unpack_dict_list(x, 'TODO') # is data? # unpack_image_origin = lambda x: unpack_dict_list(x, 'origin')
# unpack_image_height = lambda x: unpack_dict_list(x, 'height')
# unpack_image_width = lambda x: unpack_dict_list(x, 'width')
# unpack_image_n_channels = lambda x: unpack_dict_list(x, 'nChannels')
Expand Down Expand Up @@ -317,6 +321,8 @@ def apply_extractors_and_merge(df, anno_2_ex_config, keep_stranger_features, str
extractor = lambda c: df[c].apply(extract_master, configs=anno_2_ex_config[c])
keep_strangers = lambda c: df[c]

stranger_features.append('path') if 'path' in df.columns and 'text_entity' in anno_2_ex_config.keys() else None

# merged_extraction_df
# apply the extract_master together with its configs to every column and generate a list of output DF's, one per Spark NLP COL
# TODO handle MULTI-COL-OUTPUT. If Anno has multi cols, then we either needs multiple keys in anno_2_ex or use something besides
Expand Down
4 changes: 3 additions & 1 deletion nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -11303,7 +11303,8 @@ class Spellbook:
'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE,
'classify.image': OCR_NODE_IDS.VISUAL_DOCUMENT_CLASSIFIER,
'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482',
'en.image_table_detector':'general_model_table_detection_v2'
'en.image_table_detector':'general_model_table_detection_v2',
'en.lilt_roberta_funds.v1': 'lilt_roberta_funsd_v1',
}

# ocr_model_references = {
Expand Down Expand Up @@ -16273,6 +16274,7 @@ class Spellbook:
'general_model_table_detection_v2': 'ImageTableDetector',
'image_table_cell_detector': 'ImageTableCellDetector',
'image_table_cell2text_table': 'ImageCellsToTextTable',
'lilt_roberta_funsd_v1': 'VisualDocumentNer',
'instructor_large':'InstructorEmbeddings',
'instructor_base':'InstructorEmbeddings',
'initial_model': 'MPNetEmbeddings',
Expand Down
2 changes: 2 additions & 0 deletions nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,8 @@ class AnnoClassRef:
OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR: 'ImageTableCellDetector',
OCR_NODE_IDS.IMAGE_TABLE_CELL2TEXT_TABLE: 'ImageCellsToTextTable',
OCR_NODE_IDS.IMAGE_SPLIT_REGIONS: 'ImageSplitRegions',
OCR_NODE_IDS.VISUAL_DOCUMENT_NER: 'VisualDocumentNer',
OCR_NODE_IDS.HOCR_TOKENIZER: 'HocrTokenizer',
}

@staticmethod
Expand Down
Loading

0 comments on commit 32609ec

Please sign in to comment.