diff --git a/examples/colab/ocr/ocr_form_relation.ipynb b/examples/colab/ocr/ocr_form_relation.ipynb new file mode 100644 index 00000000..3c6d5952 --- /dev/null +++ b/examples/colab/ocr/ocr_form_relation.ipynb @@ -0,0 +1,675 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/ocr/ocr_form_relation.ipynb)\n", + "\n", + "[Tutorial Notebook](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/ocr/ocr_form_relation.ipynb \"https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/ocr/ocr_form_relation.ipynb\")\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "# **FormRelationExtractor**\n", + "\n", + "\n", + "The **FormRelationExtractor** is a tool designed to identify the relationships between keys and values. 
It’s particularly useful in the context of data extracted by a Named Entity Recognition (NER) system, such as VisualDocumentNER.\n", + "\n", + "**All the available models:**\n", + "\n", + "| NLU Spell | Transformer Class |\n", + "|----------------------|-----------------------------------------------------------------------------------------|\n", + "| nlu.load(`visual_form_relation_extractor`) | [FormRelationExtractor](https://nlp.johnsnowlabs.com/docs/en/ocr_visual_document_understanding#formrelationextractor) |" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## **Install NLU**" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "!pip install johnsnowlabs\n", + "nlp.install(visual=True,force_browser=True)\n", + "nlp.start(visual=True)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## **Form Relation Extraction**" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "06:59:10, INFO Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "06:59:10, INFO Parsed Nlu_ref=visual_form_relation_extractor as lang=en\n", + "Adding visual_form_relation_extractor to internal component_list\n", + "Adding visual_form_relation_extractor to internal 
component_list\n", + "Adding visual_form_relation_extractor to internal component_list\n", + "Adding visual_form_relation_extractor to internal component_list\n", + "06:59:10, INFO Adding visual_form_relation_extractor to internal component_list\n", + "Satisfying dependencies\n", + "Satisfying dependencies\n", + "Satisfying dependencies\n", + "Satisfying dependencies\n", + "06:59:10, INFO Satisfying dependencies\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "06:59:10, INFO ========================================================================\n", + "Resolution Status provided_features_no_ref = {'visual_classifier_prediction'}\n", + "Resolution Status provided_features_no_ref = {'visual_classifier_prediction'}\n", + "Resolution Status provided_features_no_ref = {'visual_classifier_prediction'}\n", + "Resolution Status provided_features_no_ref = {'visual_classifier_prediction'}\n", + "06:59:10, INFO Resolution Status provided_features_no_ref = {'visual_classifier_prediction'}\n", + "Resolution Status required_features_no_ref = {'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_entity'}\n", + "06:59:10, INFO Resolution Status required_features_no_ref = {'text_entity'}\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "06:59:10, INFO Resolution Status provided_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status 
required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "06:59:10, INFO Resolution Status required_features_ref = set()\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "06:59:10, INFO Resolution Status is_trainable = False\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "06:59:10, INFO Resolution Status conversion_candidates = []\n", + "Resolution Status missing_features_no_ref = {'text_entity'}\n", + "Resolution Status missing_features_no_ref = {'text_entity'}\n", + "Resolution Status missing_features_no_ref = {'text_entity'}\n", + "Resolution Status missing_features_no_ref = {'text_entity'}\n", + "06:59:10, INFO Resolution Status missing_features_no_ref = {'text_entity'}\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "06:59:10, INFO Resolution Status conversion_candidates = set()\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "06:59:10, INFO ========================================================================\n", + "Getting default for missing_feature_type=text_entity\n", + "Getting default for missing_feature_type=text_entity\n", + "Getting default for missing_feature_type=text_entity\n", + "Getting default for 
missing_feature_type=text_entity\n", + "06:59:10, INFO Getting default for missing_feature_type=text_entity\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "Warning::Spark Session already created, some configs may not take.\n", + "lilt_roberta_funsd_v1 download started this may take some time.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Resolved for missing components the following NLU components : []\n", + "Resolved for missing components the following NLU components : []\n", + "Resolved for missing components the following NLU components : []\n", + "Resolved for missing components the following NLU components : []\n", + "06:59:11, INFO Resolved for missing components the following NLU components : []\n", + "adding visual_document_ner\n", + "adding visual_document_ner\n", + "adding visual_document_ner\n", + "adding visual_document_ner\n", + "06:59:11, INFO adding visual_document_ner\n", + "Adding visual_document_ner to internal component_list\n", + "Adding visual_document_ner to internal component_list\n", + "Adding visual_document_ner to internal component_list\n", + "Adding visual_document_ner to internal component_list\n", + "06:59:11, INFO Adding visual_document_ner to internal component_list\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "06:59:11, INFO ========================================================================\n", + "Resolution Status provided_features_no_ref = {'text_entity', 'visual_classifier_prediction'}\n", + "Resolution Status provided_features_no_ref = {'text_entity', 'visual_classifier_prediction'}\n", + "Resolution Status 
provided_features_no_ref = {'text_entity', 'visual_classifier_prediction'}\n", + "Resolution Status provided_features_no_ref = {'text_entity', 'visual_classifier_prediction'}\n", + "06:59:11, INFO Resolution Status provided_features_no_ref = {'text_entity', 'visual_classifier_prediction'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'text_entity'}\n", + "06:59:11, INFO Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'text_entity'}\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "06:59:11, INFO Resolution Status provided_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "06:59:11, INFO Resolution Status required_features_ref = set()\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "06:59:11, INFO Resolution Status is_trainable = False\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "06:59:11, INFO Resolution Status conversion_candidates = []\n", + "Resolution Status missing_features_no_ref = {'text_tokenized', 'ocr_image'}\n", + 
"Resolution Status missing_features_no_ref = {'text_tokenized', 'ocr_image'}\n", + "Resolution Status missing_features_no_ref = {'text_tokenized', 'ocr_image'}\n", + "Resolution Status missing_features_no_ref = {'text_tokenized', 'ocr_image'}\n", + "06:59:11, INFO Resolution Status missing_features_no_ref = {'text_tokenized', 'ocr_image'}\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "06:59:11, INFO Resolution Status conversion_candidates = set()\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "06:59:11, INFO ========================================================================\n", + "Getting default for missing_feature_type=text_tokenized\n", + "Getting default for missing_feature_type=text_tokenized\n", + "Getting default for missing_feature_type=text_tokenized\n", + "Getting default for missing_feature_type=text_tokenized\n", + "06:59:11, INFO Getting default for missing_feature_type=text_tokenized\n", + "Getting default for missing_feature_type=ocr_image\n", + "Getting default for missing_feature_type=ocr_image\n", + "Getting default for missing_feature_type=ocr_image\n", + "Getting default for missing_feature_type=ocr_image\n", + "06:59:11, INFO Getting default for missing_feature_type=ocr_image\n", + "Resolved for missing components the following NLU components : [, ]\n", + "Resolved for missing components the following NLU components : [, ]\n", + "Resolved for missing components the following NLU components : [, ]\n", + "Resolved for missing components the following NLU components : [, ]\n", + "06:59:11, 
INFO Resolved for missing components the following NLU components : [, ]\n", + "adding hocr_tokenizer\n", + "adding hocr_tokenizer\n", + "adding hocr_tokenizer\n", + "adding hocr_tokenizer\n", + "06:59:11, INFO adding hocr_tokenizer\n", + "Adding hocr_tokenizer to internal component_list\n", + "Adding hocr_tokenizer to internal component_list\n", + "Adding hocr_tokenizer to internal component_list\n", + "Adding hocr_tokenizer to internal component_list\n", + "06:59:11, INFO Adding hocr_tokenizer to internal component_list\n", + "adding binary2image\n", + "adding binary2image\n", + "adding binary2image\n", + "adding binary2image\n", + "06:59:11, INFO adding binary2image\n", + "Adding binary2image to internal component_list\n", + "Adding binary2image to internal component_list\n", + "Adding binary2image to internal component_list\n", + "Adding binary2image to internal component_list\n", + "06:59:11, INFO Adding binary2image to internal component_list\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "06:59:11, INFO ========================================================================\n", + "Resolution Status provided_features_no_ref = {'text_entity', 'text_tokenized', 'ocr_image', 'visual_classifier_prediction'}\n", + "Resolution Status provided_features_no_ref = {'text_entity', 'text_tokenized', 'ocr_image', 'visual_classifier_prediction'}\n", + "Resolution Status provided_features_no_ref = {'text_entity', 'text_tokenized', 'ocr_image', 'visual_classifier_prediction'}\n", + "Resolution Status provided_features_no_ref = {'text_entity', 'text_tokenized', 'ocr_image', 'visual_classifier_prediction'}\n", + "06:59:11, INFO Resolution Status provided_features_no_ref = {'text_entity', 
'text_tokenized', 'ocr_image', 'visual_classifier_prediction'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "06:59:11, INFO Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "06:59:11, INFO Resolution Status provided_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "06:59:11, INFO Resolution Status required_features_ref = set()\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "06:59:11, INFO Resolution Status is_trainable = False\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "06:59:11, INFO Resolution Status conversion_candidates = []\n", + "Resolution Status missing_features_no_ref = {'hocr'}\n", + "Resolution Status missing_features_no_ref = {'hocr'}\n", + "Resolution Status missing_features_no_ref = {'hocr'}\n", + "Resolution Status missing_features_no_ref = {'hocr'}\n", + "06:59:11, INFO Resolution Status 
missing_features_no_ref = {'hocr'}\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "06:59:11, INFO Resolution Status conversion_candidates = set()\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "06:59:11, INFO ========================================================================\n", + "Getting default for missing_feature_type=hocr\n", + "Getting default for missing_feature_type=hocr\n", + "Getting default for missing_feature_type=hocr\n", + "Getting default for missing_feature_type=hocr\n", + "06:59:11, INFO Getting default for missing_feature_type=hocr\n", + "Resolved for missing components the following NLU components : []\n", + "Resolved for missing components the following NLU components : []\n", + "Resolved for missing components the following NLU components : []\n", + "Resolved for missing components the following NLU components : []\n", + "06:59:11, INFO Resolved for missing components the following NLU components : []\n", + "adding image2hocr\n", + "adding image2hocr\n", + "adding image2hocr\n", + "adding image2hocr\n", + "06:59:11, INFO adding image2hocr\n", + "Adding image2hocr to internal component_list\n", + "Adding image2hocr to internal component_list\n", + "Adding image2hocr to internal component_list\n", + "Adding image2hocr to internal component_list\n", + "06:59:11, INFO Adding image2hocr to internal component_list\n", + "========================================================================\n", + "========================================================================\n", + 
"========================================================================\n", + "========================================================================\n", + "06:59:11, INFO ========================================================================\n", + "Resolution Status provided_features_no_ref = {'text_tokenized', 'visual_classifier_prediction', 'hocr', 'ocr_image', 'text_entity'}\n", + "Resolution Status provided_features_no_ref = {'text_tokenized', 'visual_classifier_prediction', 'hocr', 'ocr_image', 'text_entity'}\n", + "Resolution Status provided_features_no_ref = {'text_tokenized', 'visual_classifier_prediction', 'hocr', 'ocr_image', 'text_entity'}\n", + "Resolution Status provided_features_no_ref = {'text_tokenized', 'visual_classifier_prediction', 'hocr', 'ocr_image', 'text_entity'}\n", + "06:59:11, INFO Resolution Status provided_features_no_ref = {'text_tokenized', 'visual_classifier_prediction', 'hocr', 'ocr_image', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "06:59:11, INFO Resolution Status required_features_no_ref = {'text_tokenized', 'ocr_image', 'hocr', 'text_entity'}\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "Resolution Status provided_features_ref = set()\n", + "06:59:11, INFO Resolution Status provided_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "Resolution Status 
required_features_ref = set()\n", + "06:59:11, INFO Resolution Status required_features_ref = set()\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "Resolution Status is_trainable = False\n", + "06:59:11, INFO Resolution Status is_trainable = False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Approximate size to download 419.6 MB\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "Resolution Status conversion_candidates = []\n", + "06:59:11, INFO Resolution Status conversion_candidates = []\n", + "Resolution Status missing_features_no_ref = set()\n", + "Resolution Status missing_features_no_ref = set()\n", + "Resolution Status missing_features_no_ref = set()\n", + "Resolution Status missing_features_no_ref = set()\n", + "06:59:11, INFO Resolution Status missing_features_no_ref = set()\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "Resolution Status conversion_candidates = set()\n", + "06:59:11, INFO Resolution Status conversion_candidates = set()\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "========================================================================\n", + "06:59:11, INFO ========================================================================\n", + "!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!* ALL DEPENDENCIES SATISFIED !*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*\n", + "!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!* ALL DEPENDENCIES 
SATISFIED !*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*\n", + "!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!* ALL DEPENDENCIES SATISFIED !*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*\n", + "!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!* ALL DEPENDENCIES SATISFIED !*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*\n", + "06:59:11, INFO !*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!* ALL DEPENDENCIES SATISFIED !*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*\n", + "Fixing column names\n", + "Fixing column names\n", + "Fixing column names\n", + "Fixing column names\n", + "06:59:11, INFO Fixing column names\n", + "Fixing input and output column names\n", + "Fixing input and output column names\n", + "Fixing input and output column names\n", + "Fixing input and output column names\n", + "06:59:11, INFO Fixing input and output column names\n", + "Checking for component_to_resolve visual_form_relation_extractor wether inputs {'text_entity'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve visual_form_relation_extractor wether inputs {'text_entity'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve visual_form_relation_extractor wether inputs {'text_entity'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve visual_form_relation_extractor wether inputs {'text_entity'} is satisfied by another component_to_resolve in the component_list \n", + "06:59:11, INFO Checking for component_to_resolve visual_form_relation_extractor wether inputs {'text_entity'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve visual_document_ner wether inputs {'text_tokenized', 'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve visual_document_ner wether inputs {'text_tokenized', 'ocr_image'} is satisfied by another component_to_resolve in the 
component_list \n", + "Checking for component_to_resolve visual_document_ner wether inputs {'text_tokenized', 'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve visual_document_ner wether inputs {'text_tokenized', 'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "06:59:11, INFO Checking for component_to_resolve visual_document_ner wether inputs {'text_tokenized', 'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve hocr_tokenizer wether inputs {'hocr'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve hocr_tokenizer wether inputs {'hocr'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve hocr_tokenizer wether inputs {'hocr'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve hocr_tokenizer wether inputs {'hocr'} is satisfied by another component_to_resolve in the component_list \n", + "06:59:11, INFO Checking for component_to_resolve hocr_tokenizer wether inputs {'hocr'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve binary2image wether inputs {'content', 'path'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve binary2image wether inputs {'content', 'path'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve binary2image wether inputs {'content', 'path'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve binary2image wether inputs {'content', 'path'} is satisfied by another component_to_resolve in the component_list \n", + "06:59:11, INFO Checking for component_to_resolve binary2image 
wether inputs {'content', 'path'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve image2hocr wether inputs {'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve image2hocr wether inputs {'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve image2hocr wether inputs {'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve image2hocr wether inputs {'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "06:59:11, INFO Checking for component_to_resolve image2hocr wether inputs {'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "Optimizing component_list component_to_resolve order\n", + "Optimizing component_list component_to_resolve order\n", + "Optimizing component_list component_to_resolve order\n", + "Optimizing component_list component_to_resolve order\n", + "06:59:11, INFO Optimizing component_list component_to_resolve order\n", + "Starting to optimize component_to_resolve order \n", + "Starting to optimize component_to_resolve order \n", + "Starting to optimize component_to_resolve order \n", + "Starting to optimize component_to_resolve order \n", + "06:59:11, INFO Starting to optimize component_to_resolve order \n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve 
visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "06:59:11, INFO Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve binary2image\n", + "Optimizing order for component_to_resolve binary2image\n", + "Optimizing order for component_to_resolve binary2image\n", + "Optimizing order for component_to_resolve binary2image\n", + "06:59:11, INFO Optimizing order for component_to_resolve binary2image\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "06:59:11, INFO Optimizing order for component_to_resolve 
hocr_tokenizer\n", + "Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve image2hocr\n", + "06:59:11, INFO Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "06:59:11, INFO Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve image2hocr\n", + "06:59:11, INFO Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", 
+ "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve hocr_tokenizer\n", + "06:59:11, INFO Optimizing order for component_to_resolve hocr_tokenizer\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "06:59:11, INFO Optimizing order 
for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_document_ner\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_document_ner\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "06:59:11, INFO Optimizing order for component_to_resolve visual_form_relation_extractor\n", + "Renaming duplicates cols\n", + "Renaming duplicates cols\n", + "Renaming duplicates cols\n", + "Renaming duplicates cols\n", + "06:59:11, INFO Renaming duplicates cols\n", + "Done with component_list optimizing\n", + "Done with component_list optimizing\n", + "Done with component_list optimizing\n", + "Done with component_list optimizing\n", + "06:59:11, INFO Done with component_list optimizing\n", + "Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.\n", + "Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.\n", + "Fitting on empty Dataframe, could not infer correct training method. 
This is intended for non-trainable pipelines.\n", + "Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.\n", + "06:59:11, INFO Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.\n", + "Configuring Light Pipeline Usage\n", + "Configuring Light Pipeline Usage\n", + "Configuring Light Pipeline Usage\n", + "Configuring Light Pipeline Usage\n", + "06:59:11, INFO Configuring Light Pipeline Usage\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "Warning::Spark Session already created, some configs may not take.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Inferred and set output level of pipeline to relation\n", + "Inferred and set output level of pipeline to relation\n", + "Inferred and set output level of pipeline to relation\n", + "Inferred and set output level of pipeline to relation\n", + "06:59:15, INFO Inferred and set output level of pipeline to relation\n", + "Extracting for same_level_cols = ['meta_visual_classifier_prediction_bbox1', 'meta_visual_classifier_prediction_entity1_begin', 'meta_visual_classifier_prediction_x', 'meta_visual_classifier_prediction_bbox2', 'meta_visual_classifier_prediction_entity1_end', 'meta_visual_classifier_prediction_entity2', 'meta_visual_classifier_prediction_entity1', 'meta_visual_classifier_prediction_entity2_end', 'meta_visual_classifier_prediction_height', 'meta_visual_classifier_prediction_y', 'meta_visual_classifier_prediction_width', 'meta_visual_classifier_prediction_entity2_begin']\n", + "\n", + "Extracting for same_level_cols = ['meta_visual_classifier_prediction_bbox1', 'meta_visual_classifier_prediction_entity1_begin', 'meta_visual_classifier_prediction_x', 'meta_visual_classifier_prediction_bbox2', 'meta_visual_classifier_prediction_entity1_end', 
'meta_visual_classifier_prediction_entity2', 'meta_visual_classifier_prediction_entity1', 'meta_visual_classifier_prediction_entity2_end', 'meta_visual_classifier_prediction_height', 'meta_visual_classifier_prediction_y', 'meta_visual_classifier_prediction_width', 'meta_visual_classifier_prediction_entity2_begin']\n", + "\n", + "Extracting for same_level_cols = ['meta_visual_classifier_prediction_bbox1', 'meta_visual_classifier_prediction_entity1_begin', 'meta_visual_classifier_prediction_x', 'meta_visual_classifier_prediction_bbox2', 'meta_visual_classifier_prediction_entity1_end', 'meta_visual_classifier_prediction_entity2', 'meta_visual_classifier_prediction_entity1', 'meta_visual_classifier_prediction_entity2_end', 'meta_visual_classifier_prediction_height', 'meta_visual_classifier_prediction_y', 'meta_visual_classifier_prediction_width', 'meta_visual_classifier_prediction_entity2_begin']\n", + "\n", + "Extracting for same_level_cols = ['meta_visual_classifier_prediction_bbox1', 'meta_visual_classifier_prediction_entity1_begin', 'meta_visual_classifier_prediction_x', 'meta_visual_classifier_prediction_bbox2', 'meta_visual_classifier_prediction_entity1_end', 'meta_visual_classifier_prediction_entity2', 'meta_visual_classifier_prediction_entity1', 'meta_visual_classifier_prediction_entity2_end', 'meta_visual_classifier_prediction_height', 'meta_visual_classifier_prediction_y', 'meta_visual_classifier_prediction_width', 'meta_visual_classifier_prediction_entity2_begin']\n", + "\n", + "06:59:19, INFO Extracting for same_level_cols = ['meta_visual_classifier_prediction_bbox1', 'meta_visual_classifier_prediction_entity1_begin', 'meta_visual_classifier_prediction_x', 'meta_visual_classifier_prediction_bbox2', 'meta_visual_classifier_prediction_entity1_end', 'meta_visual_classifier_prediction_entity2', 'meta_visual_classifier_prediction_entity1', 'meta_visual_classifier_prediction_entity2_end', 'meta_visual_classifier_prediction_height', 
'meta_visual_classifier_prediction_y', 'meta_visual_classifier_prediction_width', 'meta_visual_classifier_prediction_entity2_begin']\n", + "\n" + ] + } + ], + "source": [ + "from johnsnowlabs import nlp, visual\n", + "import nlu\n", + "\n", + "model = nlu.load('visual_form_relation_extractor',verbose=True)\n", + "res = model.predict(['form.png','form2.png'])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-05T01:29:19.227738100Z", + "start_time": "2024-05-05T01:29:10.870752Z" + } + } + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [ + { + "data": { + "text/plain": " form_relation_prediction_key \\\n0 patient name \n0 lab no \n0 uhid \n0 sample date \n0 age / gender \n0 report date \n0 prescribed by \n0 opd / ipd opd \n0 sample type \n0 specimen \n0 result : \n1 name : \n1 study date : \n1 bp : \n1 mrn : \n1 patient location : \n1 hr : \n1 dob : \n1 gender : \n1 height : \n1 age : \n1 weight : \n1 reason for study : \n1 bsa : \n1 history : \n1 medications : \n1 performed . \n1 . \n\n form_relation_prediction_value \\\n0 mrs meera singh \n0 477737 \n0 248275 \n0 20 / 04 / 2019 1 : 55 9 \n0 33 yrs / female \n0 22 / 04 / 2019 10 : 56 \n0 dr . chetna jain \n0 sec - \n0 urine \n0 mid stream urine \n0 culture sterile after 48 hours of aerobic incu... \n1 dribbler , bbb \n1 12 - 09 - 2006 , 6 : 34 \n1 120 / 80 mmhg \n1 12341820060912 \n1 room \n1 100 bpm \n1 19 - 06 - 1979 \n1 male \n1 123 cm \n1 27 years \n1 25 kg \n1 mi \n1 0 . 92 m \n1 asfgfdgsdg \n1 heparine , paracetamol \n1 the study technically limited . \n1 no \n\n path \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... 
\n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n0 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... \n1 file:/F:/Work/repos/nlu_new/ner/nlu/examples/c... ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
form_relation_prediction_keyform_relation_prediction_valuepath
0patient namemrs meera singhfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0lab no477737file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0uhid248275file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0sample date20 / 04 / 2019 1 : 55 9file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0age / gender33 yrs / femalefile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0report date22 / 04 / 2019 10 : 56file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0prescribed bydr . chetna jainfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0opd / ipd opdsec -file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0sample typeurinefile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0specimenmid stream urinefile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
0result :culture sterile after 48 hours of aerobic incu...file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1name :dribbler , bbbfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1study date :12 - 09 - 2006 , 6 : 34file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1bp :120 / 80 mmhgfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1mrn :12341820060912file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1patient location :roomfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1hr :100 bpmfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1dob :19 - 06 - 1979file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1gender :malefile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1height :123 cmfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1age :27 yearsfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1weight :25 kgfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1reason for study :mifile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1bsa :0 . 92 mfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1history :asfgfdgsdgfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1medications :heparine , paracetamolfile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1performed .the study technically limited .file:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
1.nofile:/F:/Work/repos/nlu_new/ner/nlu/examples/c...
\n
" + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res_filtered = res[['form_relation_prediction_key','form_relation_prediction_value','path']]\n", + "res_filtered" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-05T01:33:38.903202300Z", + "start_time": "2024-05-05T01:33:38.883843200Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "name": "myenv", + "language": "python", + "display_name": "myenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/colab/ocr/table_extraction.ipynb b/examples/colab/ocr/table_extraction.ipynb index b3e449ae..fd79fb83 100644 --- a/examples/colab/ocr/table_extraction.ipynb +++ b/examples/colab/ocr/table_extraction.ipynb @@ -2752,4 +2752,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/nlu/ocr_components/form_relation_extractor/__init__.py b/nlu/ocr_components/form_relation_extractor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/ocr_components/form_relation_extractor/form_relation_extractor.py b/nlu/ocr_components/form_relation_extractor/form_relation_extractor.py new file mode 100644 index 00000000..0d0ee446 --- /dev/null +++ b/nlu/ocr_components/form_relation_extractor/form_relation_extractor.py @@ -0,0 +1,8 @@ + +class FormRelationExtractor: + @staticmethod + def get_default_model(): + from sparkocr.transformers import FormRelationExtractor + return FormRelationExtractor() \ + .setInputCol("text_entity") \ + .setOutputCol("ocr_relations") diff --git 
a/nlu/pipe/col_substitution/col_substitution_OCR.py b/nlu/pipe/col_substitution/col_substitution_OCR.py index 9cabccd4..76991b50 100644 --- a/nlu/pipe/col_substitution/col_substitution_OCR.py +++ b/nlu/pipe/col_substitution/col_substitution_OCR.py @@ -96,3 +96,13 @@ def substitute_document_ner_cols(c, cols, nlu_identifier): # new_cols[c] = c return new_cols +def substitute_form_extractor_text_cols(c, cols, is_unique=True, nlu_identifier=''): + new_cols = {} + for c in cols: + if 'meta_visual_classifier_prediction_entity1' in c: + new_cols['meta_visual_classifier_prediction_entity1'] = 'form_relation_prediction_key' + if 'meta_visual_classifier_prediction_entity2' in c: + new_cols['meta_visual_classifier_prediction_entity2'] = 'form_relation_prediction_value' + # if 'path' in c: + # new_cols['path'] = 'file_path' + return new_cols \ No newline at end of file diff --git a/nlu/pipe/col_substitution/substitution_map_OCR.py b/nlu/pipe/col_substitution/substitution_map_OCR.py index 4387eb35..bbe83634 100644 --- a/nlu/pipe/col_substitution/substitution_map_OCR.py +++ b/nlu/pipe/col_substitution/substitution_map_OCR.py @@ -20,5 +20,8 @@ VisualDocumentNerLilt : { 'default': substitute_document_ner_cols, }, + FormRelationExtractor : { + 'default': substitute_form_extractor_text_cols, + } } diff --git a/nlu/pipe/extractors/extractor_configs_OCR.py b/nlu/pipe/extractors/extractor_configs_OCR.py index e9acda71..7b744dcc 100644 --- a/nlu/pipe/extractors/extractor_configs_OCR.py +++ b/nlu/pipe/extractors/extractor_configs_OCR.py @@ -28,6 +28,16 @@ def default_visual_classifier_config(output_col_prefix='visual_classifier'): description='Gets label and confidence of visual classifier', ) +def default_form_relation_extractor_config(output_col_prefix='extracted_relations'): + return SparkOCRExtractorConfig( + output_col_prefix=output_col_prefix, + get_result=True, + get_full_meta=True, + name='full_relation_extraction', + description='Get relation extraction result and all metadata, 
with positions of entities', + ) + + def default_visual_ner_config(output_col_prefix='visual_ocr'): return SparkOCRExtractorConfig( get_text=True, diff --git a/nlu/spellbook.py b/nlu/spellbook.py index 4c3cf5ca..6e26c034 100644 --- a/nlu/spellbook.py +++ b/nlu/spellbook.py @@ -11324,6 +11324,7 @@ class Spellbook: 'en.classify_image.tabacco': 'visual_document_classifier_tobacco3482', 'en.image_table_detector':'general_model_table_detection_v2', 'en.lilt_roberta_funds.v1': 'lilt_roberta_funsd_v1', + 'visual_form_relation_extractor': OCR_NODE_IDS.FORM_RELATION_EXTRACTOR, } # ocr_model_references = { @@ -16299,7 +16300,8 @@ class Spellbook: 'general_model_table_detection_v2': 'ImageTableDetector', 'image_table_cell_detector': 'ImageTableCellDetector', 'image_table_cell2text_table': 'ImageCellsToTextTable', - 'lilt_roberta_funsd_v1': 'VisualDocumentNer', + 'visual_form_relation_extractor':'FormRelationExtractor', + 'lilt_roberta_funsd_v1': 'VisualDocumentNer', 'instructor_large':'InstructorEmbeddings', 'instructor_base':'InstructorEmbeddings', 'initial_model': 'MPNetEmbeddings', diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py index 318b85ac..92157e8a 100644 --- a/nlu/universe/annotator_class_universe.py +++ b/nlu/universe/annotator_class_universe.py @@ -317,6 +317,7 @@ class AnnoClassRef: OCR_NODE_IDS.IMAGE_SPLIT_REGIONS: 'ImageSplitRegions', OCR_NODE_IDS.VISUAL_DOCUMENT_NER: 'VisualDocumentNer', OCR_NODE_IDS.HOCR_TOKENIZER: 'HocrTokenizer', + OCR_NODE_IDS.FORM_RELATION_EXTRACTOR: 'FormRelationExtractor', } @staticmethod diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py index ae8b5592..f46f257b 100644 --- a/nlu/universe/component_universes.py +++ b/nlu/universe/component_universes.py @@ -149,17 +149,18 @@ from nlu.ocr_components.table_extractors.image2table.image2table import IMAGE_TABLE_DETECTOR from nlu.ocr_components.visual_ner.visual_document_ner.visual_document_ner import 
VisualDocumentNer from nlu.ocr_components.table_extractors.image2table_cell.image2table_cell import ImageTableCellDetector +from nlu.ocr_components.form_relation_extractor.form_relation_extractor import FormRelationExtractor from nlu.ocr_components.table_extractors.image_table_cell2text.image_table_cell2text import ImageTable2Cell2TextTable from nlu.ocr_components.utils.image_split_regions.image_split_regions import ImageSplitRegions # from nlu.ocr_components.visual_classifiers.visual_doc_classifier.visual_doc_classifier import VisualDocClassifier from nlu.pipe.col_substitution.col_substitution_HC import * from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols, \ - substitute_document_classifier_text_cols + substitute_document_classifier_text_cols, substitute_form_extractor_text_cols from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols,substitute_document_ner_cols from nlu.pipe.col_substitution.col_substitution_OS import * from nlu.pipe.extractors.extractor_configs_HC import * from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, \ - default_visual_classifier_config + default_visual_classifier_config,default_form_relation_extractor_config from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, default_visual_ner_config from nlu.pipe.extractors.extractor_configs_OS import * from nlu.pipe.nlu_component import NluComponent @@ -4519,4 +4520,25 @@ class ComponentUniverse: applicable_file_types=['JPG', 'JPEG'] ), + O_A.FORM_RELATION_EXTRACTOR: partial(NluComponent, + name=O_A.FORM_RELATION_EXTRACTOR, + type=T.TEXT_RECOGNIZER, + get_default_model=FormRelationExtractor.get_default_model, + # TODO EXtractor0 + pdf_extractor_methods={'default': default_form_relation_extractor_config}, + # TODO substitor + pdf_col_name_substitutor=substitute_form_extractor_text_cols, + 
output_level=L.RELATION, + node=OCR_FEATURE_NODES.nodes[O_A.FORM_RELATION_EXTRACTOR], + description='Convert text to PDF file', + provider=ComponentBackends.ocr, + license=Licenses.ocr, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=O_A.FORM_RELATION_EXTRACTOR, + jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[ + O_A.FORM_RELATION_EXTRACTOR], + applicable_file_types=['DOCX', 'DOC'], + ), + } diff --git a/nlu/universe/feature_node_ids.py b/nlu/universe/feature_node_ids.py index cac595f2..9b8310f4 100644 --- a/nlu/universe/feature_node_ids.py +++ b/nlu/universe/feature_node_ids.py @@ -319,6 +319,8 @@ class OCR_NODE_IDS: """All available Feature nodes in OCR Used to cast the pipeline dependency resolution algorithm into an abstract graph """ + + FORM_RELATION_EXTRACTOR = JslAnnoId('visual_form_relation_extractor') # Visual Document Understanding VISUAL_DOCUMENT_CLASSIFIER = JslAnnoId('visual_document_classifier') VISUAL_DOCUMENT_NER = JslAnnoId('visual_document_ner') diff --git a/nlu/universe/feature_node_universes.py b/nlu/universe/feature_node_universes.py index 25a3657c..fe6b83df 100644 --- a/nlu/universe/feature_node_universes.py +++ b/nlu/universe/feature_node_universes.py @@ -301,6 +301,9 @@ class OCR_FEATURE_NODES: A = OCR_NODE_IDS F = OCR_FEATURES nodes = { + A.FORM_RELATION_EXTRACTOR: OcrFeatureNode(A.FORM_RELATION_EXTRACTOR, [F.TEXT_ENTITY], + [F.VISUAL_RELATION]), + A.VISUAL_DOCUMENT_CLASSIFIER: OcrFeatureNode(A.VISUAL_DOCUMENT_CLASSIFIER, [F.HOCR], [F.VISUAL_CLASSIFIER_PREDICTION, F.VISUAL_CLASSIFIER_CONFIDENCE, F.FILE_PATH]), diff --git a/nlu/universe/feature_resolutions.py b/nlu/universe/feature_resolutions.py index 78f3a006..3c086036 100644 --- a/nlu/universe/feature_resolutions.py +++ b/nlu/universe/feature_resolutions.py @@ -116,5 +116,6 @@ class FeatureResolutions: ComponentUniverse.components[OCR_NODE_IDS.IMAGE_TABLE_CELL_DETECTOR]), OCR_FEATURES.TEXT_DOCUMENT_TOKENIZED: 
ResolvedFeature(OCR_NODE_IDS.HOCR_TOKENIZER, OCR_NODE_IDS.HOCR_TOKENIZER, 'xx', False, ComponentUniverse.components[OCR_NODE_IDS.HOCR_TOKENIZER]), - + OCR_FEATURES.TEXT_ENTITY: ResolvedFeature(OCR_NODE_IDS.VISUAL_DOCUMENT_NER, OCR_NODE_IDS.VISUAL_DOCUMENT_NER, + 'xx', False,ComponentUniverse.components[OCR_NODE_IDS.VISUAL_DOCUMENT_NER]), } diff --git a/nlu/universe/feature_universes.py b/nlu/universe/feature_universes.py index eaf7e49b..2d1c8aa8 100644 --- a/nlu/universe/feature_universes.py +++ b/nlu/universe/feature_universes.py @@ -129,7 +129,9 @@ class OCR_FEATURES(JslFeature): PREDICTION_CONFIDENCE = JslFeature("prediction_confidence") # TODO is this just int or some struct? VISUAL_CLASSIFIER_CONFIDENCE = JslFeature("visual_classifier_confidence") VISUAL_CLASSIFIER_PREDICTION = JslFeature("visual_classifier_prediction") + VISUAL_RELATION = JslFeature("visual_classifier_prediction") + FORM_RELATION = JslFeature('ocr_relations') class NLP_HC_FEATURES(JslFeature): """ diff --git a/tests/nlu_ocr_tests/cv_test.png b/tests/datasets/ocr/images/cv_test.png similarity index 100% rename from tests/nlu_ocr_tests/cv_test.png rename to tests/datasets/ocr/images/cv_test.png diff --git a/tests/datasets/ocr/images/form.png b/tests/datasets/ocr/images/form.png new file mode 100644 index 00000000..e69de29b diff --git a/tests/datasets/ocr/images/form2.png b/tests/datasets/ocr/images/form2.png new file mode 100644 index 00000000..e69de29b diff --git a/tests/nlu_ocr_tests/letter.jpg b/tests/datasets/ocr/images/letter.jpg similarity index 100% rename from tests/nlu_ocr_tests/letter.jpg rename to tests/datasets/ocr/images/letter.jpg diff --git a/tests/nlu_ocr_tests/ocr_form_relation_extractor.py b/tests/nlu_ocr_tests/ocr_form_relation_extractor.py new file mode 100644 index 00000000..90d0564c --- /dev/null +++ b/tests/nlu_ocr_tests/ocr_form_relation_extractor.py @@ -0,0 +1,27 @@ +import os +import sys + +sys.path.append(os.getcwd()) +import unittest +import nlu + 
+os.environ["PYTHONPATH"] = "F:/Work/repos/nlu" +os.environ['PYSPARK_PYTHON'] = sys.executable +os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable +from johnsnowlabs import nlp, visual + +# nlp.install(json_license_path='license.json',visual=True) +nlp.start(visual=True) + +class OcrTest(unittest.TestCase): + + def test_classify_document(self): + # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) + # text that we generate PDF to has to come from an image struct! + # We need convert text to img struct! + p = nlu.load('visual_form_relation_extractor').predict(['tests/datasets/ocr/form.png','tests/datasets/ocr/form2.png']) + for df in p: + print(p.to_markdown()) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/nlu_ocr_tests/ocr_ner.png b/tests/nlu_ocr_tests/ocr_ner.png new file mode 100644 index 00000000..e69de29b