Skip to content

Commit

Permalink
Added Visual Form Relation Extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
gadde5300 committed May 5, 2024
1 parent 69c9452 commit 6efe108
Show file tree
Hide file tree
Showing 20 changed files with 771 additions and 5 deletions.
675 changes: 675 additions & 0 deletions examples/colab/ocr/ocr_form_relation.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/colab/ocr/table_extraction.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2752,4 +2752,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
}
}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

class FormRelationExtractor:
    """NLU wrapper that builds the Spark OCR ``FormRelationExtractor`` annotator."""

    @staticmethod
    def get_default_model():
        """Return a ``FormRelationExtractor`` wired to NLU's default columns.

        The annotator reads from the ``text_entity`` column and writes to
        the ``ocr_relations`` column.
        """
        # The import lives inside the function, so ``sparkocr`` is only
        # needed when a model is actually requested.
        from sparkocr.transformers import FormRelationExtractor as _SparkOcrFormRelationExtractor

        extractor = _SparkOcrFormRelationExtractor()
        extractor = extractor.setInputCol("text_entity")
        return extractor.setOutputCol("ocr_relations")
10 changes: 10 additions & 0 deletions nlu/pipe/col_substitution/col_substitution_OCR.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,13 @@ def substitute_document_ner_cols(c, cols, nlu_identifier):
# new_cols[c] = c
return new_cols

def substitute_form_extractor_text_cols(c, cols, is_unique=True, nlu_identifier=''):
    """Build the column-rename map for form relation extractor output.

    :param c: unused here; accepted for signature compatibility with callers
    :param cols: iterable of column names to inspect
    :param is_unique: unused here
    :param nlu_identifier: unused here
    :return: dict mapping each marker found as a substring of at least one
        column name to its substituted name. Keys are the marker strings
        themselves, inserted in the order they are first encountered.
    """
    # (marker substring, substituted name); checked in this order per column.
    marker_to_name = (
        ('meta_visual_classifier_prediction_entity1', 'form_relation_prediction_key'),
        ('meta_visual_classifier_prediction_entity2', 'form_relation_prediction_value'),
    )
    new_cols = {}
    for col_name in cols:
        for marker, substituted in marker_to_name:
            if marker in col_name:
                new_cols[marker] = substituted
    return new_cols
3 changes: 3 additions & 0 deletions nlu/pipe/col_substitution/substitution_map_OCR.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,8 @@
VisualDocumentNerLilt : {
'default': substitute_document_ner_cols,
},
FormRelationExtractor : {
'default': substitute_form_extractor_text_cols,
}

}
10 changes: 10 additions & 0 deletions nlu/pipe/extractors/extractor_configs_OCR.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@ def default_visual_classifier_config(output_col_prefix='visual_classifier'):
description='Gets label and confidence of visual classifier',
)

def default_form_relation_extractor_config(output_col_prefix='extracted_relations'):
    """Return the default extractor config for form relation extraction.

    :param output_col_prefix: prefix passed through to the config
    :return: a ``SparkOCRExtractorConfig`` with ``get_result`` and
        ``get_full_meta`` enabled
    """
    config_kwargs = {
        'output_col_prefix': output_col_prefix,
        'get_result': True,
        'get_full_meta': True,
        'name': 'full_relation_extraction',
        'description': 'Get relation extraction result and all metadata, with positions of entities',
    }
    return SparkOCRExtractorConfig(**config_kwargs)


def default_visual_ner_config(output_col_prefix='visual_ocr'):
return SparkOCRExtractorConfig(
get_text=True,
Expand Down
4 changes: 3 additions & 1 deletion nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -11324,6 +11324,7 @@ class Spellbook:
'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482',
'en.image_table_detector':'general_model_table_detection_v2',
'en.lilt_roberta_funds.v1': 'lilt_roberta_funsd_v1',
'visual_form_relation_extractor': OCR_NODE_IDS.FORM_RELATION_EXTRACTOR,
}

# ocr_model_references = {
Expand Down Expand Up @@ -16299,7 +16300,8 @@ class Spellbook:
'general_model_table_detection_v2': 'ImageTableDetector',
'image_table_cell_detector': 'ImageTableCellDetector',
'image_table_cell2text_table': 'ImageCellsToTextTable',
'lilt_roberta_funsd_v1': 'VisualDocumentNer',
'visual_form_relation_extractor':'FormRelationExtractor',
'lilt_roberta_funsd_v1': 'VisualDocumentNer',
'instructor_large':'InstructorEmbeddings',
'instructor_base':'InstructorEmbeddings',
'initial_model': 'MPNetEmbeddings',
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ class AnnoClassRef:
OCR_NODE_IDS.IMAGE_SPLIT_REGIONS: 'ImageSplitRegions',
OCR_NODE_IDS.VISUAL_DOCUMENT_NER: 'VisualDocumentNer',
OCR_NODE_IDS.HOCR_TOKENIZER: 'HocrTokenizer',
OCR_NODE_IDS.FORM_RELATION_EXTRACTOR: 'FormRelationExtractor',
}

@staticmethod
Expand Down
26 changes: 24 additions & 2 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,17 +149,18 @@
from nlu.ocr_components.table_extractors.image2table.image2table import IMAGE_TABLE_DETECTOR
from nlu.ocr_components.visual_ner.visual_document_ner.visual_document_ner import VisualDocumentNer
from nlu.ocr_components.table_extractors.image2table_cell.image2table_cell import ImageTableCellDetector
from nlu.ocr_components.form_relation_extractor.form_relation_extractor import FormRelationExtractor
from nlu.ocr_components.table_extractors.image_table_cell2text.image_table_cell2text import ImageTable2Cell2TextTable
from nlu.ocr_components.utils.image_split_regions.image_split_regions import ImageSplitRegions
# from nlu.ocr_components.visual_classifiers.visual_doc_classifier.visual_doc_classifier import VisualDocClassifier
from nlu.pipe.col_substitution.col_substitution_HC import *
from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols, \
substitute_document_classifier_text_cols
substitute_document_classifier_text_cols, substitute_form_extractor_text_cols
from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols,substitute_document_ner_cols
from nlu.pipe.col_substitution.col_substitution_OS import *
from nlu.pipe.extractors.extractor_configs_HC import *
from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, \
default_visual_classifier_config
default_visual_classifier_config,default_form_relation_extractor_config
from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, default_visual_ner_config
from nlu.pipe.extractors.extractor_configs_OS import *
from nlu.pipe.nlu_component import NluComponent
Expand Down Expand Up @@ -4519,4 +4520,25 @@ class ComponentUniverse:
applicable_file_types=['JPG', 'JPEG']
),

O_A.FORM_RELATION_EXTRACTOR: partial(NluComponent,
name=O_A.FORM_RELATION_EXTRACTOR,
type=T.TEXT_RECOGNIZER,
get_default_model=FormRelationExtractor.get_default_model,
# TODO EXtractor0
pdf_extractor_methods={'default': default_form_relation_extractor_config},
# TODO substitor
pdf_col_name_substitutor=substitute_form_extractor_text_cols,
output_level=L.RELATION,
node=OCR_FEATURE_NODES.nodes[O_A.FORM_RELATION_EXTRACTOR],
description='Convert text to PDF file',
provider=ComponentBackends.ocr,
license=Licenses.ocr,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=O_A.FORM_RELATION_EXTRACTOR,
jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[
O_A.FORM_RELATION_EXTRACTOR],
applicable_file_types=['DOCX', 'DOC'],
),

}
2 changes: 2 additions & 0 deletions nlu/universe/feature_node_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,8 @@ class OCR_NODE_IDS:
"""All available Feature nodes in OCR
Used to cast the pipeline dependency resolution algorithm into an abstract graph
"""

FORM_RELATION_EXTRACTOR = JslAnnoId('visual_form_relation_extractor')
# Visual Document Understanding
VISUAL_DOCUMENT_CLASSIFIER = JslAnnoId('visual_document_classifier')
VISUAL_DOCUMENT_NER = JslAnnoId('visual_document_ner')
Expand Down
3 changes: 3 additions & 0 deletions nlu/universe/feature_node_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@ class OCR_FEATURE_NODES:
A = OCR_NODE_IDS
F = OCR_FEATURES
nodes = {
A.FORM_RELATION_EXTRACTOR: OcrFeatureNode(A.FORM_RELATION_EXTRACTOR, [F.TEXT_ENTITY],
[F.VISUAL_RELATION]),

A.VISUAL_DOCUMENT_CLASSIFIER: OcrFeatureNode(A.VISUAL_DOCUMENT_CLASSIFIER, [F.HOCR],
[F.VISUAL_CLASSIFIER_PREDICTION, F.VISUAL_CLASSIFIER_CONFIDENCE, F.FILE_PATH]),

Expand Down
3 changes: 2 additions & 1 deletion nlu/universe/feature_resolutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,5 +116,6 @@ class FeatureResolutions:
ComponentUniverse.components[OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR]),
OCR_FEATURES.TEXT_DOCUMENT_TOKENIZED: ResolvedFeature(OCR_NODE_IDS.HOCR_TOKENIZER, OCR_NODE_IDS.HOCR_TOKENIZER, 'xx', False,
ComponentUniverse.components[OCR_NODE_IDS.HOCR_TOKENIZER]),

OCR_FEATURES.TEXT_ENTITY: ResolvedFeature(OCR_NODE_IDS.VISUAL_DOCUMENT_NER, OCR_NODE_IDS.VISUAL_DOCUMENT_NER,
'xx', False,ComponentUniverse.components[OCR_NODE_IDS.VISUAL_DOCUMENT_NER]),
}
2 changes: 2 additions & 0 deletions nlu/universe/feature_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,9 @@ class OCR_FEATURES(JslFeature):
PREDICTION_CONFIDENCE = JslFeature("prediction_confidence") # TODO is this just int or some struct?
VISUAL_CLASSIFIER_CONFIDENCE = JslFeature("visual_classifier_confidence")
VISUAL_CLASSIFIER_PREDICTION = JslFeature("visual_classifier_prediction")
VISUAL_RELATION = JslFeature("visual_classifier_prediction")

FORM_RELATION = JslFeature('ocr_relations')

class NLP_HC_FEATURES(JslFeature):
"""
Expand Down
File renamed without changes
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
File renamed without changes
27 changes: 27 additions & 0 deletions tests/nlu_ocr_tests/ocr_form_relation_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os
import sys

sys.path.append(os.getcwd())
import unittest
import nlu

# NOTE(review): hard-coded, machine-specific absolute path; presumably it
# should point at the local checkout of this repository -- confirm before
# relying on this test on another machine or in CI.
os.environ["PYTHONPATH"] = "F:/Work/repos/nlu"
# Set both PySpark interpreter variables to the interpreter running this script.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from johnsnowlabs import nlp, visual

# nlp.install(json_license_path='license.json',visual=True)
nlp.start(visual=True)

class OcrTest(unittest.TestCase):
    """Smoke test for the ``visual_form_relation_extractor`` NLU pipeline."""

    def test_classify_document(self):
        """Run the form relation extractor on two sample form images and print the result."""
        # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET)
        image_paths = ['tests/datasets/ocr/form.png', 'tests/datasets/ocr/form2.png']
        p = nlu.load('visual_form_relation_extractor').predict(image_paths)
        # Print the whole result once. Looping over `p` and printing `p`
        # itself on every iteration would only repeat the same table.
        print(p.to_markdown())

if __name__ == '__main__':
unittest.main()
Empty file added tests/nlu_ocr_tests/ocr_ner.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 6efe108

Please sign in to comment.