Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VisionEncoderDecoderForImageCaptioning Integration #229

Open
wants to merge 1 commit into
base: release/511
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from sparknlp.annotator import *
from sparknlp.base import *


class VisionEncoderDecoder:
    """Factory helpers for the ``VisionEncoderDecoderForImageCaptioning`` annotator.

    Both helpers configure the annotator to read from the ``image_assembler``
    column and to write its result to the ``caption`` column.
    """

    @staticmethod
    def get_default_model():
        """Build the annotator from ``pretrained()`` called without arguments."""
        captioner = VisionEncoderDecoderForImageCaptioning.pretrained()
        captioner = captioner.setInputCols("image_assembler")
        return captioner.setOutputCol("caption")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Build the annotator from ``pretrained(name, language, bucket)``.

        Args:
            name: Model name forwarded to ``pretrained``.
            language: Language code forwarded to ``pretrained``.
            bucket: Optional bucket forwarded to ``pretrained``; defaults to ``None``.
        """
        captioner = VisionEncoderDecoderForImageCaptioning.pretrained(name, language, bucket)
        captioner = captioner.setInputCols("image_assembler")
        return captioner.setOutputCol("caption")





2 changes: 2 additions & 0 deletions nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -2446,6 +2446,7 @@ class Spellbook:
'en.seq2seq.distilbart_cnn_12_6': 'distilbart_cnn_12_6',
'en.seq2seq.distilbart_xsum_6_6': 'distilbart_xsum_6_6',
'en.classify_image.swin.tiny': 'image_classifier_swin_tiny_patch4_window7_224',
'en.classify_image.image_captioning_vit_gpt2': 'image_captioning_vit_gpt2',
'en.speech2text.hubert': 'asr_hubert_large_ls960',
'en.speech2text.hubert.large_ls960': 'asr_hubert_large_ls960',
'en.albert': 'albert_base_uncased',
Expand Down Expand Up @@ -19977,6 +19978,7 @@ class Spellbook:
'asr_hubert_large_ls960': 'HubertForCTC',
'image_classifier_swin_tiny_patch4_window7_224': 'SwinForImageClassification',
'camembert_base_qa_fquad': 'CamemBertForQuestionAnswering',
'image_captioning_vit_gpt2': 'VisionEncoderDecoderForImageCaptioning',

'summarizer_clinical_jsl': 'MedicalSummarizer',
'summarizer_clinical_jsl_augmented': 'MedicalSummarizer',
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ class AnnoClassRef:
A_N.PARTIAL_Normalizer: 'Normalizer',
A_N.VIT_IMAGE_CLASSIFICATION: 'ViTForImageClassification',
A_N.CONVNEXT_IMAGE_CLASSIFICATION: 'ConvNextImageClassifier',
A_N.VISION_ENCODER_DECODER_FOR_IMAGE_CAPTIONING: 'VisionEncoderDecoderForImageCaptioning',

}
JSL_anno_HC_ref_2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
Expand Down
25 changes: 25 additions & 0 deletions nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from nlu.components.classifiers.image_classification_swin.swin import SwinImageClassifier
from nlu.components.classifiers.image_classification_vit.convnext_image_classification import ConvNextImageClassifier
from nlu.components.classifiers.image_classification_vit.vit_image_classifier import VitImageClassifier
from nlu.components.classifiers.image_encoder_decoder.vision_encoder_decoder import VisionEncoderDecoder
from nlu.components.classifiers.language_detector.language_detector import LanguageDetector
from nlu.components.classifiers.multi_classifier.multi_classifier import MultiClassifier
from nlu.components.classifiers.named_entity_recognizer_crf.ner_crf import NERDLCRF
Expand Down Expand Up @@ -3374,6 +3375,30 @@ class ComponentUniverse:
requires_image_format=True,
is_visual_annotator=True,
),
# Registry entry for the image-captioning annotator: maps its annotator id to a
# partially applied NluComponent carrying the model loaders, feature node,
# license/provider and compute-context metadata.
A.VISION_ENCODER_DECODER_FOR_IMAGE_CAPTIONING: partial(NluComponent,
name=A.VISION_ENCODER_DECODER_FOR_IMAGE_CAPTIONING,
# NOTE(review): registered under IMAGE_CLASSIFICATION although the annotator is a captioning model — confirm this category is intended.
type=T.IMAGE_CLASSIFICATION,
# Loader callbacks come from the VisionEncoderDecoder helper class.
get_default_model=VisionEncoderDecoder.get_default_model,
get_pretrained_model=VisionEncoderDecoder.get_pretrained_model,
# NOTE(review): reuses the generic document extractor configs and the recognized-text column substitutor — verify they fit the caption output.
pdf_extractor_methods={
'default': default_document_config,
'default_full': default_full_config},
pdf_col_name_substitutor=substitute_recognized_text_cols,
output_level=L.DOCUMENT,
node=NLP_FEATURE_NODES.nodes[
A.VISION_ENCODER_DECODER_FOR_IMAGE_CAPTIONING],
# NOTE(review): placeholder text — replace with a real description before release.
description='TODO',
provider=ComponentBackends.open_source,

license=Licenses.open_source,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=A.VISION_ENCODER_DECODER_FOR_IMAGE_CAPTIONING,
jsl_anno_py_class=ACR.JSL_anno2_py_class[
A.VISION_ENCODER_DECODER_FOR_IMAGE_CAPTIONING],
# Same image flags as the preceding entry in this registry.
requires_image_format=True,
is_visual_annotator=True,
),

A.IMAGE_ASSEMBLER: partial(NluComponent,
name=A.IMAGE_ASSEMBLER,
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/feature_node_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ class NLP_NODE_IDS:
VIT_IMAGE_CLASSIFICATION = JslAnnoId("vit_image_classification")
CONVNEXT_IMAGE_CLASSIFICATION = JslAnnoId("convnext_image_classification")
SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
VISION_ENCODER_DECODER_FOR_IMAGE_CAPTIONING = JslAnnoId("vision_encoder_decoder_for_image_captioning")
BART_TRANSFORMER = JslAnnoId("bart_transformer")
INSTRUCTOR_SENTENCE_EMBEDDINGS = JslAnnoId('instructor_sentence_embeddings')

Expand Down
1 change: 1 addition & 0 deletions nlu/universe/feature_node_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ class NLP_FEATURE_NODES: # or Mode Node?
A.VIT_IMAGE_CLASSIFICATION: NlpFeatureNode(A.VIT_IMAGE_CLASSIFICATION, [F.IMAGE], [F.CLASSIFIED_IMAGE]),
A.CONVNEXT_IMAGE_CLASSIFICATION: NlpFeatureNode(A.CONVNEXT_IMAGE_CLASSIFICATION, [F.IMAGE], [F.CLASSIFIED_IMAGE]),
A.SWIN_IMAGE_CLASSIFICATION: NlpFeatureNode(A.SWIN_IMAGE_CLASSIFICATION, [F.IMAGE], [F.CLASSIFIED_IMAGE]),
A.VISION_ENCODER_DECODER_FOR_IMAGE_CAPTIONING: NlpFeatureNode(A.VISION_ENCODER_DECODER_FOR_IMAGE_CAPTIONING, [F.IMAGE], [F.CLASSIFIED_IMAGE]),
A.BART_TRANSFORMER: NlpFeatureNode(A.BART_TRANSFORMER, [F.DOCUMENT], [F.DOCUMENT_GENERATED]),

}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import unittest

from nlu import *


class VisionEncoderDecoderTest(unittest.TestCase):
    """Smoke test for the ``en.classify_image.image_captioning_vit_gpt2`` pipeline."""

    def test_image_captioning_vit_gpt2_model(self):
        """Load the captioning pipeline, run it on the sample images and check for output."""
        pipe = nlu.load("en.classify_image.image_captioning_vit_gpt2")
        # NOTE(review): the path is relative to the working directory the test
        # is launched from — confirm the runner starts in this test's folder.
        df = pipe.predict([r'./../../../datasets/ocr/vit/general_images/images'])
        print(df)
        # Without an assertion the test would pass even if no rows were produced.
        self.assertGreater(len(df), 0, "image captioning pipeline returned no rows")


if __name__ == "__main__":
    unittest.main()