diff --git a/examples/colab/component_examples/classifiers/Bart_Zero_Shot_Classifier.ipynb b/examples/colab/component_examples/classifiers/Bart_Zero_Shot_Classifier.ipynb new file mode 100644 index 00000000..de35671b --- /dev/null +++ b/examples/colab/component_examples/classifiers/Bart_Zero_Shot_Classifier.ipynb @@ -0,0 +1,313 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" + ], + "metadata": { + "id": "7A9NQR0tVbWf" + } + }, + { + "cell_type": "markdown", + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/classifiers/Bart_Zero_Shot_Classifier.ipynb)" + ], + "metadata": { + "id": "XCxDeiyZxNyV" + } + }, + { + "cell_type": "markdown", + "source": [ + "### **Zero Shot Classifiers**" + ], + "metadata": { + "id": "ba7qk8Dwxc29" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Zero Shot Text Classification\n", + "\n", + "State-of-the-art NLP models for text classification without annotated data\n", + "\n", + "Natural language processing is a very exciting field right now. In recent years, the community has begun to figure out some pretty effective methods of learning from the enormous amounts of unlabeled data available on the internet. The success of transfer learning from unsupervised models has allowed us to surpass virtually all existing benchmarks on downstream supervised learning tasks. As we continue to develop new model architectures and unsupervised learning objectives, \"state of the art\" continues to be a rapidly moving target for many tasks where large amounts of labeled data are available.\n", + "\n", + "### Zero Shot Learning\n", + "\n", + "Zero-shot Learning (ZSL) is one of the most recent advancements in Machine Learning, aimed at training Deep Neural Network models to generalise better to unseen data. One of the most prominent methods of training such models is to use text prompts that explain the task to be solved, along with all possible outputs.\n", + "\n", + "The primary aim of using ZSL over supervised learning is to address the following limitations of traditional supervised learning models:\n", + "\n", + "1. Training supervised NLP models requires a substantial amount of training data.\n", + "2. Even with the recent trend of fine-tuning large language models, supervised training or fine-tuning essentially fits one very specific data distribution, which results in low performance when the model is applied to diverse and unseen data.\n", + "3. The classical annotate-train-test cycle is highly demanding in terms of temporal and human resources." + ], + "metadata": { + "id": "VOktZCAgxffG" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Bart Zero Shot Classifier\n", + "\n", + "This model is intended to be used for zero-shot text classification, especially in English. It is fine-tuned on MNLI using the large BART model.\n", + "\n", + "BartForZeroShotClassification uses a ModelForSequenceClassification trained on MNLI tasks. 
It is the equivalent of BartForSequenceClassification models, except that the set of candidate classes does not need to be hardcoded: the labels can be chosen at runtime (a sketch of customizing them follows the output below). This usually makes inference slower, but it is much more flexible.\n", + "\n", + "We used TFBartForSequenceClassification to train this model and use the BartForZeroShotClassification annotator in Spark NLP 🚀 for prediction at scale." + ], + "metadata": { + "id": "MvfFWlHrxmGQ" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8w2RtQGCU_Xg" + }, + "outputs": [], + "source": [ + "!pip install nlu\n", + "!pip install pyspark==3.4.1" + ] + }, + { + "cell_type": "code", + "source": [ + "import nlu\n", + "import pandas as pd" + ], + "metadata": { + "id": "mU_7-Y4nVZXA" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "text = ['I have a problem with my hotel reservation that needs to be resolved asap!!']" + ], + "metadata": { + "id": "Bn1xHZfGVqJA" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "bart_zero_shot = nlu.load('en.bart.zero_shot_classifier')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "29Q7-Riqw74U", + "outputId": "e7ed4737-efbb-48cc-fbdd-40d01545cb27" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "bart_large_zero_shot_classifier_mnli download started this may take some time.\n", + "Approximate size to download 445.4 MB\n", + "[OK!]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "results = bart_zero_shot.predict(text, output_level = 'document')" + ], + "metadata": { + "id": "34efPvSQw9cl" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "results" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 + }, + "id": "s75RpDk6w-5f", + "outputId": "8cd5cdf1-2982-478b-a05e-819c0bd4b2d2" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " classified_sequence classified_sequence_confidence \\\n", + "0 [travel] [0.12591693] \n", + "\n", + " document \n", + "0 I have a problem with my hotel reservation tha... " + ], + "text/html": [
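The pretrained pipeline above runs with a generic set of candidate labels; since the classes can be chosen at runtime, it is natural to want to supply your own. Below is a minimal sketch of how that might look. It assumes the underlying Spark NLP zero-shot annotator exposes `setCandidateLabels` and that the NLU pipeline makes its stages reachable via `pipe.components`, each with a `.model` attribute; verify both access paths against your installed NLU and Spark NLP versions.

```python
# Minimal sketch: choose the candidate classes at runtime.
# Assumptions: the NLU pipe exposes its Spark NLP stages via `pipe.components`,
# each reachable as `component.model`, and the zero-shot annotator exposes
# `setCandidateLabels`. Check both against your installed versions.
import nlu

pipe = nlu.load('en.bart.zero_shot_classifier')

for component in pipe.components:
    model = getattr(component, 'model', None)
    if model is not None and hasattr(model, 'setCandidateLabels'):
        # No retraining needed: the labels are just new NLI hypotheses.
        model.setCandidateLabels(['urgent', 'booking', 'travel', 'complaint'])

results = pipe.predict(
    ['I have a problem with my hotel reservation that needs to be resolved asap!!'],
    output_level='document')
print(results[['classified_sequence', 'classified_sequence_confidence']])
```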
\n" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "UgFt5oRbogZw" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/examples/colab/component_examples/classifiers/NLU_DeBertaForZeroShotClassification.ipynb b/examples/colab/component_examples/classifiers/NLU_DeBertaForZeroShotClassification.ipynb new file mode 100644 index 00000000..d3a24ff7 --- /dev/null +++ b/examples/colab/component_examples/classifiers/NLU_DeBertaForZeroShotClassification.ipynb @@ -0,0 +1,504 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/classifiers/NLU_DeBertaForZeroShotClassification.ipynb)\n", + "\n", + "# DeBertaForZeroShotClassification\n", + "\n", + "DeBertaForZeroShotClassification annotator, leveraging the DeBERTa architecture, introduces sophisticated zero-shot classification capabilities, enabling the classification of text into predefined classes without direct example training.\n", + "\n", + "## DeBERTa Model Overview\n", + "\n", + "The DeBERTa model, standing for \"Decoding-enhanced BERT with disentangled attention,\" marks a pivotal step forward in the natural language processing landscape. As elucidated in the paper *Building Efficient Universal Classifiers with Natural Language Inference* by Moritz Laurer et al., DeBERTa transcends the typical BERT-like architecture by incorporating a unique disentangled attention mechanism. This enhancement deepens its understanding of the nuanced relationships between tokens at varying positions.\n", + "\n", + "DeBERTa's training involved a combination of Natural Language Inference (NLI) datasets alongside a diverse array of non-NLI datasets, culminating in its remarkable zero-shot classification abilities. The model's exposure to 33 datasets encompassing 389 classes empowers it with the proficiency to classify text into unseen categories, thus exhibiting significant performance boosts and heightened efficiency over generative Large Language Models.\n", + "\n", + "## DeBertaForZeroShotClassification in Spark NLP\n", + "\n", + "Within the Spark NLP suite, the `DeBertaForZeroShotClassification` annotator emerges as a potent and adaptable instrument for text classification tasks, drawing upon the zero-shot learning prowess of the DeBERTa model. This annotator distinguishes itself from conventional models by eschewing the need for a fixed number of classes. Instead, it endows users with the capability to dynamically define classes at runtime, thereby allowing for the classification of texts against an arbitrary set of labels without necessitating model retraining. 
While this dynamic classification approach may introduce a slight delay due to runtime class definition, it offers unparalleled flexibility.\n", + "\n", + "Leveraging the `DeBertaForZeroShotClassification` annotator powered by Spark NLP 🚀 enables predictions at scale, ensuring that state-of-the-art text classification is both accessible and efficient.\n", + "\n", + "## Reference: [DeBerta](https://arxiv.org/pdf/2312.17543.pdf)\n", + "\n", + "### Paper Abstract\n", + "\n", + "Generative Large Language Models (LLMs) have risen to prominence as the preferred method for few-shot and zero-shot learning due to their text generation universality. However, not all users require the expansive capabilities of generative LLMs, especially when their focus is solely on automating classification tasks. In such cases, smaller BERT-like models have proven to be a more efficient alternative, capable of learning universal tasks and performing any text classification task in a zero-shot manner—without the need for fine-tuning—or with minimal examples (few-shot). This paper makes significant contributions by:\n", + "\n", + "1. **Explaining** how Natural Language Inference (NLI) can be harnessed as a universal classification task, aligning with the principles used for instruction fine-tuning in generative LLMs.\n", + "2. **Providing** a detailed, step-by-step guide complemented by reusable Jupyter notebooks to facilitate the construction of a universal classifier.\n", + "3. **Sharing** the fruits of this methodology—a universal classifier trained on 33 datasets encompassing 389 varied classes. This classifier not only builds on the foundation of previously developed zero-shot classifiers, which have been downloaded over 55 million times from the Hugging Face Hub, but also enhances zero-shot performance by an impressive 9.4%.\n", + "\n", + "\n" + ], + "metadata": { + "id": "d90kTce5qXik" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gmRrcVRSibm1" + }, + "outputs": [], + "source": [ + "! pip install spark-nlp==5.3.0\n", + "! 
pip install nlu pyspark==3.1.2" + ] + }, + { + "cell_type": "code", + "source": [ + "import nlu" + ], + "metadata": { + "id": "p6JuvMXoikuj" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "nlu.__file__" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "Lw6HdIiEi2Rq", + "outputId": "91f73bd3-b47f-4210-ff7a-5044cc71ec7a" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/usr/local/lib/python3.10/dist-packages/nlu/__init__.py'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model = nlu.load('en.deberta.zero_shot_classifier')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "C-mYZzfxi5TI", + "outputId": "340740fa-a52f-4bd6-8449-c77a11c9362a" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "deberta_base_zero_shot_classifier_mnli_anli_v3 download started this may take some time.\n", + "Approximate size to download 420.7 MB\n", + "[OK!]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "text = [\"I loved this movie when I was a child.\", \"It was pretty boring.\"]" + ], + "metadata": { + "id": "4Dofd4wpn9jY" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df = model.predict(text)" + ], + "metadata": { + "id": "_ORx66_woC_z" + }, + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 125 + }, + "id": "sEgeGDPToieB", + "outputId": "8711a3d8-6492-4b27-a2ef-d620dc5db66e" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " classified_sequence classified_sequence_confidence \\\n", + "0 music 0.211335 \n", + "1 weather 0.161989 \n", + "\n", + " sentence \n", + "0 I loved this movie when I was a child. \n", + "1 It was pretty boring. " + ], + "text/html": [ + "\n", + "
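The NLI reformulation that the paper builds on can be stated concretely: score one entailment hypothesis per candidate label against the input text, and return the label whose hypothesis is most entailed. The sketch below illustrates only that reformulation; `entailment_score` is a hypothetical stand-in for the entailment probability a real NLI model (such as the DeBERTa checkpoint loaded above) would produce.

```python
# Conceptual sketch of classification-as-NLI, the idea behind zero-shot models.
# `entailment_score(premise, hypothesis)` is a hypothetical stand-in for
# P(entailment) from a real NLI model.
def zero_shot_classify(text, labels, entailment_score,
                       template="This example is about {}."):
    # One premise/hypothesis pair per candidate label.
    scores = {label: entailment_score(text, template.format(label))
              for label in labels}
    return max(scores, key=scores.get), scores

# Dummy scorer, just to make the sketch runnable end to end.
toy_scorer = lambda premise, hypothesis: float('movie' in hypothesis)
print(zero_shot_classify("I loved this movie when I was a child.",
                         ["movie", "weather", "music"], toy_scorer))
```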
\n" ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "repr_error": "'str' object has no attribute 'empty'" + } + }, + "metadata": {}, + "execution_count": 12 + } + ] + } + ] +} \ No newline at end of file diff --git a/examples/colab/component_examples/classifiers/XlmRoberta_Zero_Shot_Classifier.ipynb b/examples/colab/component_examples/classifiers/XlmRoberta_Zero_Shot_Classifier.ipynb new file mode 100644 index 00000000..99f31cfd --- /dev/null +++ b/examples/colab/component_examples/classifiers/XlmRoberta_Zero_Shot_Classifier.ipynb @@ -0,0 +1,511 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" + ], + "metadata": { + "id": "7A9NQR0tVbWf" + } + }, + { + "cell_type": "markdown", + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/classifiers/XlmRoberta_Zero_Shot_Classifier.ipynb)" + ], + "metadata": { + "id": "XCxDeiyZxNyV" + } + }, + { + "cell_type": "markdown", + "source": [ + "### **Zero Shot Classifiers**" + ], + "metadata": { + "id": "ba7qk8Dwxc29" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Zero Shot Text Classification\n", + "\n", + "State-of-the-art NLP models for text classification without annotated data\n", + "\n", + "Natural language processing is a very exciting field right now. In recent years, the community has begun to figure out some pretty effective methods of learning from the enormous amounts of unlabeled data available on the internet. The success of transfer learning from unsupervised models has allowed us to surpass virtually all existing benchmarks on downstream supervised learning tasks. As we continue to develop new model architectures and unsupervised learning objectives, \"state of the art\" continues to be a rapidly moving target for many tasks where large amounts of labeled data are available.\n", + "\n", + "### Zero Shot Learning\n", + "\n", + "Zero-shot Learning (ZSL) is one of the most recent advancements in Machine Learning, aimed at training Deep Neural Network models to generalise better to unseen data. One of the most prominent methods of training such models is to use text prompts that explain the task to be solved, along with all possible outputs.\n", + "\n", + "The primary aim of using ZSL over supervised learning is to address the following limitations of traditional supervised learning models:\n", + "\n", + "1. Training supervised NLP models requires a substantial amount of training data.\n", + "2. Even with the recent trend of fine-tuning large language models, supervised training or fine-tuning essentially fits one very specific data distribution, which results in low performance when the model is applied to diverse and unseen data.\n", + "3. The classical annotate-train-test cycle is highly demanding in terms of temporal and human resources." + ], + "metadata": { + "id": "VOktZCAgxffG" + } + }, + { + "cell_type": "markdown", + "source": [ + "### XlmRoberta Zero Shot Classifier\n", + "\n", + "This model is intended to be used for zero-shot text classification, especially in English. 
It is fine-tuned on NLI using the XlmRoberta large model.\n", + "\n", + "XlmRoBertaForZeroShotClassification uses a ModelForSequenceClassification trained on NLI (natural language inference) tasks. It is the equivalent of TFXLMRoBertaForZeroShotClassification models, except that the candidate classes do not need to be hardcoded; they can be chosen at runtime. This usually makes inference slower, but much more flexible.\n", + "\n", + "We used TFXLMRobertaForSequenceClassification to train this model and use the XlmRoBertaForZeroShotClassification annotator in Spark NLP 🚀 for prediction at scale!" + ], + "metadata": { + "id": "MvfFWlHrxmGQ" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8w2RtQGCU_Xg" + }, + "outputs": [], + "source": [ + "!pip install nlu\n", + "!pip install pyspark==3.4.1" + ] + }, + { + "cell_type": "code", + "source": [ + "import nlu\n", + "import pandas as pd" + ], + "metadata": { + "id": "mU_7-Y4nVZXA" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "text = [\"I have a problem with my iphone that needs to be resolved asap.\",\n", + " \"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\",\n", + " \"Ich habe diesen Film geliebt, als ich ein Kind war.\",\n", + " \"I really want to visit Germany and I am planning to go there next year.\",\n", + " \"I loved this movie when I was a child.\",\n", + " \"I always hated this movie and it's plot.\",\n", + " \"Deplasmanda kazanmak çok mutluluk verici.\"\n", + " ]" + ], + "metadata": { + "id": "Bn1xHZfGVqJA" + }, + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "xlm_roberta_zero_shot = nlu.load('xx.xlm_roberta.zero_shot_classifier')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "29Q7-Riqw74U", + "outputId": "cd297f3b-aafa-480f-9664-2d353ce2c192" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "Warning::Spark Session already created, some configs may not take.\n", + "xlm_roberta_large_zero_shot_classifier_xnli_anli download started this may take some time.\n", + "Approximate size to download 1.8 GB\n", + "[OK!]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "results = xlm_roberta_zero_shot.predict(text)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "34efPvSQw9cl", + "outputId": "acb15dc8-e88b-4543-f1bd-84fc8b83ff60" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "sentence_detector_dl download started this may take some time.\n", + "Approximate size to download 354.6 KB\n", + "[OK!]\n", + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "results" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "id": "s75RpDk6w-5f", + "outputId": "ff3f891f-fd35-4e5e-cf4b-0521151e1c59" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " classified_sequence classified_sequence_confidence \\\n", + "0 urgent 0.509167 \n", + "1 technology 0.700567 \n", + "2 movie 0.896203 \n", + "3 travel 0.950424 \n", + "4 movie 0.701275 \n", + "5 movie 0.899282 \n", + "6 sport 0.794113 \n", + "\n", + " 
sentence \n", + "0 I have a problem with my iphone that needs to ... \n", + "1 Last week I upgraded my iOS version and ever s... \n", + "2 Ich habe diesen Film geliebt, als ich ein Kind... \n", + "3 I really want to visit Germany and I am planni... \n", + "4 I loved this movie when I was a child. \n", + "5 I always hated this movie and it's plot. \n", + "6 Deplasmanda kazanmak çok mutluluk verici. " + ], + "text/html": [ + "\n", + "
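Because `predict` returns an ordinary pandas DataFrame, the predictions above can be filtered or reshaped with plain pandas. A small sketch (column names as in the output above; the cast guards against the confidence column arriving as strings in some NLU versions) that keeps only confident predictions:

```python
# Keep only predictions the classifier is reasonably confident about.
# `results` is the DataFrame returned by predict() above.
confident = results[
    results['classified_sequence_confidence'].astype(float) > 0.7
]
print(confident[['sentence', 'classified_sequence',
                 'classified_sequence_confidence']])
```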
\n" ] }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Dd8tkxNX798r" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/examples/colab/component_examples/named_entity_recognition_NER/NLU_ner_protein_glove_en.ipynb b/examples/colab/component_examples/named_entity_recognition_NER/NLU_ner_protein_glove_en.ipynb new file mode 100644 index 00000000..ee448d7a --- /dev/null +++ b/examples/colab/component_examples/named_entity_recognition_NER/NLU_ner_protein_glove_en.ipynb @@ -0,0 +1,902 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/named_entity_recognition_NER/NLU_ner_protein_glove_en.ipynb)\n", + "\n", + "# Detect Biomedical Entities in English\n", + "\n", + "A Named Entity Recognition model that finds `Protein` entities in biomedical texts." + ], + "metadata": { + "id": "7Rea5AhgFJD2" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3AspTpJootr7" + }, + "outputs": [], + "source": [ + "! pip install nlu pyspark==3.1.2" + ] + }, + { + "cell_type": "code", + "source": [ + "import nlu" + ], + "metadata": { + "id": "szEHu51ysEJ5" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "nlu.__file__" + ], + "metadata": { + "id": "wrfjogsrsZ4i", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "outputId": "e5c14692-86e6-4ce6-94e8-a8faee83e2cb" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/usr/local/lib/python3.10/dist-packages/nlu/__init__.py'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model = nlu.load('en.ner.dl.protein_glove')" + ], + "metadata": { + "id": "a4CG3_h-scWG", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "97401f2d-a14b-4633-a9e7-6f80c05810f2" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "ner_wikiner_glove_840B_300 download started this may take some time.\n", + "Approximate size to download 14.8 MB\n", + "[OK!]\n", + "glove_840B_300 download started this may take some time.\n", + "Approximate size to download 2.3 GB\n", + "[OK!]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Your text input\n", + "text = ['''\n", + "MACROPHAGES ARE MONONUCLEAR phagocytes that reside within almost all tissues including adipose tissue, where they are identifiable as distinct populations with tissue-specific morphology, localization, and function (1). During the process of atherosclerosis, monocytes adhere to the endothelium and migrate into the intima, express scavenger receptors, and bind internalized lipoprotein particles resulting in the formation of foam cells (2). 
In obesity, adipose tissue contains an increased number of resident macrophages (3, 4). Macrophage accumulation in proportion to adipocyte size may increase the adipose tissue production of proinflammatory and acute-phase molecules and thereby contribute to the pathophysiological consequences of obesity (1, 3). These facts indicate that macrophages play an important role in a variety of diseases. When activated, macrophages release stereotypical profiles of cytokines and biological molecules such as nitric oxide TNF-α, IL-6, and IL-1 (5). TNF-α is a potent chemoattractant (6) and originates predominantly from residing mouse peritoneal macrophages (MPM) and mast cells (7). TNF-α induces leukocyte adhesion and degranulation, stimulates nicotinamide adenine dinucleotide phosphate (NADPH) oxidase, and enhances expression of IL-2 receptors and expression of E-selectin and intercellular adhesion molecules on the endothelium (8). TNF-α also stimulates expression of IL-1, IL-2, IL-6, and platelet-activating factor receptor (9). In addition, TNF-α decreases insulin sensitivity and increases lipolysis in adipocytes (10, 11). IL-6 also increase lipolysis and has been implicated in the hypertriglyceridemia and increased serum free fatty acid levels associated with obesity (12). Increased IL-6 signaling induces the expression of C-reactive protein and haptoglubin in liver (13). Recombinant IL-6 treatment increases atherosclerotic lesion size 5-fold (14). IL-6 also dose-dependently increases macrophage oxidative low-density lipoprotein (LDL) degradation and CD36 mRNA expression in vitro (15). These data clearly indicate that IL-6 and TNF-α are important pathogenetic factors associated with obesity, insulin resistance, and atherosclerosis. However, the factors regulating gene expression of these cytokines in macrophages have not been fully clarified.\n", + "''']" + ], + "metadata": { + "id": "2pR5LdKftR5C" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Perform prediction\n", + "ner_df = model.predict(text, output_level=\"chunk\")" + ], + "metadata": { + "id": "DICL2UfbtTrS", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "17db64f4-b3af-4842-8007-b6c1bfc41d16" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "ner_df" + ], + "metadata": { + "id": "p9jPdPKntV70", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 573 + }, + "outputId": "0cc84401-e036-4405-831d-592bf9d9f8d6" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " document \\\n", + "0 MACROPHAGES ARE MONONUCLEAR phagocytes that re... \n", + "0 MACROPHAGES ARE MONONUCLEAR phagocytes that re... \n", + "0 MACROPHAGES ARE MONONUCLEAR phagocytes that re... \n", + "0 MACROPHAGES ARE MONONUCLEAR phagocytes that re... \n", + "0 MACROPHAGES ARE MONONUCLEAR phagocytes that re... 
\n", + "\n", + " entities_wikiner_glove_840B_300 entities_wikiner_glove_840B_300_class \\\n", + "0 TNF-α MISC \n", + "0 IL-2 MISC \n", + "0 IL-2 MISC \n", + "0 CD36 MISC \n", + "0 TNF-α MISC \n", + "\n", + " entities_wikiner_glove_840B_300_confidence \\\n", + "0 0.8862 \n", + "0 0.935 \n", + "0 0.9646 \n", + "0 0.8621 \n", + "0 0.8793 \n", + "\n", + " entities_wikiner_glove_840B_300_origin_chunk \\\n", + "0 0 \n", + "0 1 \n", + "0 2 \n", + "0 3 \n", + "0 4 \n", + "\n", + " entities_wikiner_glove_840B_300_origin_sentence \\\n", + "0 0 \n", + "0 0 \n", + "0 0 \n", + "0 0 \n", + "0 0 \n", + "\n", + " word_embedding_glove \n", + "0 [[0.8373799920082092, -0.6678000092506409, 0.0... \n", + "0 [[0.8373799920082092, -0.6678000092506409, 0.0... \n", + "0 [[0.8373799920082092, -0.6678000092506409, 0.0... \n", + "0 [[0.8373799920082092, -0.6678000092506409, 0.0... \n", + "0 [[0.8373799920082092, -0.6678000092506409, 0.0... " + ], + "text/html": [ + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "ner_df", + "repr_error": "unhashable type: 'numpy.ndarray'" + } + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Print the desired output\n", + "print(ner_df[[\"entities_wikiner_glove_840B_300\", \"entities_wikiner_glove_840B_300_confidence\"]])" + ], + "metadata": { + "id": "rdUQQBC9ujkF", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "066bf185-0fe3-4b36-d9b9-3f74b5b03a5e" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " entities_wikiner_glove_840B_300 entities_wikiner_glove_840B_300_confidence\n", + "0 TNF-α 0.8862\n", + "0 IL-2 0.935\n", + "0 IL-2 0.9646\n", + "0 CD36 0.8621\n", + "0 TNF-α 0.8793\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "ner_df[[\"entities_wikiner_glove_840B_300\", \"entities_wikiner_glove_840B_300_confidence\"]]" + ], + "metadata": { + "id": "mqFQYX91tOuw", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "5891f2f4-c104-4efd-8034-2e7585b5398b" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " entities_wikiner_glove_840B_300 entities_wikiner_glove_840B_300_confidence\n", + "0 TNF-α 0.8862\n", + "0 IL-2 0.935\n", + "0 IL-2 0.9646\n", + "0 CD36 0.8621\n", + "0 TNF-α 0.8793" + ], + "text/html": [ + "\n", + "
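The chunk-level output is again a plain pandas DataFrame, so the detected mentions can be aggregated directly. A small sketch (column names as shown above; the confidence column arrives as strings, hence the cast) that counts each entity and averages its confidence:

```python
# Aggregate the detected entity chunks: mention count and mean confidence.
# Column names follow the output above; confidences are cast from strings.
summary = (
    ner_df
    .assign(conf=ner_df['entities_wikiner_glove_840B_300_confidence'].astype(float))
    .groupby('entities_wikiner_glove_840B_300')['conf']
    .agg(['count', 'mean'])
    .sort_values('count', ascending=False)
)
print(summary)
```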
\n" ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"ner_df[[\\\"entities_wikiner_glove_840B_300\\\", \\\"entities_wikiner_glove_840B_300_confidence\\\"]]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"entities_wikiner_glove_840B_300\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"TNF-\\u03b1\",\n \"IL-2\",\n \"CD36\"\n ],\n \"num_unique_values\": 3,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"entities_wikiner_glove_840B_300_confidence\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"0.935\",\n \"0.8793\",\n \"0.9646\"\n ],\n \"num_unique_values\": 5,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 9 + } + ] + } + ] +} \ No newline at end of file diff --git a/examples/colab/component_examples/sentence_embeddings/NLU_BGE_sentence_embeddings.ipynb b/examples/colab/component_examples/sentence_embeddings/NLU_BGE_sentence_embeddings.ipynb new file mode 100644 index 00000000..b2893149 --- /dev/null +++ b/examples/colab/component_examples/sentence_embeddings/NLU_BGE_sentence_embeddings.ipynb @@ -0,0 +1,1157 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "rBXrqlGEYA8G" + }, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/sentence_embeddings/NLU_BGE_sentence_embeddings.ipynb)\n", + "\n", + "# BGE Sentence Embeddings with NLU\n", + "\n", + "BGE, or BAAI General Embeddings, is a model that can map any text to a low-dimensional dense\n", + " vector, which can be used for tasks like retrieval, classification, clustering, or semantic search. It can also be used in vector databases for LLMs.\n", + "\n", + "## Sources:\n", + "- https://arxiv.org/pdf/2309.07597.pdf\n", + "- https://github.com/FlagOpen/FlagEmbedding\n", + "\n", + "## Paper abstract\n", + "\n", + "This paper introduces C-Pack, a package of resources that significantly advance the field of general\n", + " Chinese embeddings. C-Pack includes three critical resources.\n", + " 1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets.\n", + " 2) C-MTP is a massive text embedding dataset curated from labeled and unlabeled Chinese corpora\n", + " for training embedding models.\n", + " 3) C-TEM is a family of embedding models covering multiple sizes.\n", + " Our models outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the\n", + " time of the release. We also integrate and optimize the entire suite of training methods for\n", + " C-TEM. Along with our resources on general Chinese embedding, we release our data and models for\n", + " English text embeddings. 
The English models achieve state-of-the-art performance on the MTEB\n", + " benchmark; meanwhile, our released English data is 2 times larger than the Chinese data.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pc-VxiUuks79" + }, + "source": [ + "**All the available models:**\n", + "\n", + "| Language | nlu.load() reference | Spark NLP Model reference |\n", + "|----------|---------------------------------|-----------------------------------------------------------------------------------------------|\n", + "| English | en.embed_sentence.bge_small \t | [bge_small](https://sparknlp.org/2024/01/01/bge_small_en.html) \t\t\t\t\t |\n", + "| English | en.embed_sentence.bge_base | [bge_base](https://sparknlp.org/2024/01/01/bge_base_en.html) \t |\n", + "| English | en.embed_sentence.bge_large | [bge_large](https://sparknlp.org/2024/01/01/bge_large_en.html) \t |\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "# 1. Install NLU" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SAdkGaH7lyEi" + }, + "outputs": [], + "source": [ + "!pip install nlu pyspark==3.4.1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N_CL8HZ8Ydry" + }, + "source": [ + "# 2. Load Model and embed sample sentence" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6gWpe1M5fIoB" + }, + "source": [ + "### en.embed_sentence.bge_small" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "j2ZZZvr1uGpx", + "outputId": "097aa80e-46f6-49f4-d2c5-fd970f729a55" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "Warning::Spark Session already created, some configs may not take.\n", + "bge_small download started this may take some time.\n", + "Approximate size to download 76.1 MB\n", + "[OK!]\n", + "sentence_detector_dl download started this may take some time.\n", + "Approximate size to download 354.6 KB\n", + "[OK!]\n", + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ], + "source": [ + "import nlu\n", + "\n", + "res = nlu.load(\"en.embed_sentence.bge_small\").predict('query: how much protein should a female eat')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 89 + }, + "id": "QFJshD-4rdor", + "outputId": "9e5785bb-6418-4f30-d1e4-12ee456503ad" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sentence \\\n", + "0 query: how much protein should a female eat \n", + "\n", + " sentence_embedding_bge_small \n", + "0 [-0.059140872210264206, -0.013027993030846119,... " + ], + "text/html": [
\n" ] }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "res" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XuzOX2d3fUAI" + }, + "source": [ + "### en.embed_sentence.bge_base" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3Ian3US8sUuw", + "outputId": "1f0c9b99-315f-47c4-9117-7d75a82b1e53" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "Warning::Spark Session already created, some configs may not take.\n", + "bge_base download started this may take some time.\n", + "Approximate size to download 246.7 MB\n", + "[OK!]\n", + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ], + "source": [ + "res = nlu.load('en.embed_sentence.bge_base').predict(\"passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.\",\n", + " output_level='document') # output_level should be set to 'document' to get the embedding of the whole document instead of each sentence separately." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 89 + }, + "id": "eSpFlZdQeUHJ", + "outputId": "01bf348f-9c44-4181-f00d-e521c62fd6b5" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " document \\\n", + "0 passage: As a general guideline, the CDC's ave... \n", + "\n", + " sentence_embedding_bge_base \n", + "0 [0.006804925389587879, -0.006068557035177946, ... " + ], + "text/html": [
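Since the embeddings come back as plain lists inside the DataFrame, a quick retrieval-style check needs nothing beyond NumPy. A minimal sketch follows, assuming the column name shown in the output above; the two passage strings are illustrative samples, not part of any dataset.

```python
# Minimal semantic-similarity check with the BGE embeddings above.
# Assumptions: column name as in the output above; the passages are
# illustrative sample strings.
import numpy as np
import nlu

docs = ['query: how much protein should a female eat',
        'passage: the CDC recommends 46 grams of protein per day for women',
        'passage: Python is a popular programming language']

emb = (nlu.load('en.embed_sentence.bge_small')
          .predict(docs, output_level='document')['sentence_embedding_bge_small'])

def cosine(a, b):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# The protein passage should land much closer to the query than the Python one.
for i in (1, 2):
    print(round(cosine(emb.iloc[0], emb.iloc[i]), 3), docs[i])
```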
\n" ] }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "res" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BAUFklCqLr3V" + }, + "source": [ + "# 3. NLU has many more sentence embedding models!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3aiiLjYilt1a" + }, + "source": [ + "Make sure to try them all out!\n", + "You can change 'embed_sentence.electra' in nlu.load('embed_sentence.electra') to bert, xlnet, albert or any other of the 20+ sentence embeddings offered by NLU" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9qUF7jPlme-R", + "outputId": "02df4660-d777-4766-fe0d-15d245e5668a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "For language NLU provides the following Models : \n", + "nlu.load('am.embed_sentence.xlm_roberta') returns Spark NLP model_anno_obj sent_xlm_roberta_base_finetuned_amharic\n", + "For language NLU provides the following Models : \n", + "nlu.load('de.embed_sentence.bert.base_cased') returns Spark NLP model_anno_obj sent_bert_base_cased\n", + "For language NLU provides the following Models : \n", + "nlu.load('el.embed_sentence.bert.base_uncased') returns Spark NLP model_anno_obj sent_bert_base_uncased\n", + "For language NLU provides the following Models : \n", + "nlu.load('en.embed_sentence') returns Spark NLP model_anno_obj tfhub_use\n", + "nlu.load('en.embed_sentence.albert') returns Spark NLP model_anno_obj albert_base_uncased\n", + "nlu.load('en.embed_sentence.bert') returns Spark NLP model_anno_obj sent_bert_base_uncased\n", + "nlu.load('en.embed_sentence.bert.base_uncased_legal') returns Spark NLP model_anno_obj sent_bert_base_uncased_legal\n", + "nlu.load('en.embed_sentence.bert.finetuned') returns Spark NLP model_anno_obj sbert_setfit_finetuned_financial_text_classification\n", + "nlu.load('en.embed_sentence.bert.pubmed') returns Spark NLP model_anno_obj sent_bert_pubmed\n", + "nlu.load('en.embed_sentence.bert.pubmed_squad2') returns Spark NLP model_anno_obj sent_bert_pubmed_squad2\n", + "nlu.load('en.embed_sentence.bert.wiki_books') returns Spark NLP model_anno_obj sent_bert_wiki_books\n", + "nlu.load('en.embed_sentence.bert.wiki_books_mnli') returns Spark NLP model_anno_obj sent_bert_wiki_books_mnli\n", + "nlu.load('en.embed_sentence.bert.wiki_books_qnli') returns Spark NLP model_anno_obj sent_bert_wiki_books_qnli\n", + "nlu.load('en.embed_sentence.bert.wiki_books_qqp') returns Spark NLP model_anno_obj sent_bert_wiki_books_qqp\n", + "nlu.load('en.embed_sentence.bert.wiki_books_squad2') returns Spark NLP model_anno_obj sent_bert_wiki_books_squad2\n", + "nlu.load('en.embed_sentence.bert.wiki_books_sst2') returns Spark NLP model_anno_obj sent_bert_wiki_books_sst2\n", + "nlu.load('en.embed_sentence.bert_base_cased') returns Spark NLP model_anno_obj sent_bert_base_cased\n", + "nlu.load('en.embed_sentence.bert_base_uncased') returns Spark NLP model_anno_obj sent_bert_base_uncased\n", + "nlu.load('en.embed_sentence.bert_large_cased') returns Spark NLP model_anno_obj sent_bert_large_cased\n", + "nlu.load('en.embed_sentence.bert_large_uncased') returns Spark NLP model_anno_obj sent_bert_large_uncased\n", + "nlu.load('en.embed_sentence.bert_use_cmlm_en_base') returns Spark NLP model_anno_obj sent_bert_use_cmlm_en_base\n", + "nlu.load('en.embed_sentence.bert_use_cmlm_en_large') returns Spark NLP model_anno_obj sent_bert_use_cmlm_en_large\n", + 
"nlu.load('en.embed_sentence.biobert.clinical_base_cased') returns Spark NLP model_anno_obj sent_biobert_clinical_base_cased\n", + "nlu.load('en.embed_sentence.biobert.discharge_base_cased') returns Spark NLP model_anno_obj sent_biobert_discharge_base_cased\n", + "nlu.load('en.embed_sentence.biobert.pmc_base_cased') returns Spark NLP model_anno_obj sent_biobert_pmc_base_cased\n", + "nlu.load('en.embed_sentence.biobert.pubmed_base_cased') returns Spark NLP model_anno_obj sent_biobert_pubmed_base_cased\n", + "nlu.load('en.embed_sentence.biobert.pubmed_large_cased') returns Spark NLP model_anno_obj sent_biobert_pubmed_large_cased\n", + "nlu.load('en.embed_sentence.biobert.pubmed_pmc_base_cased') returns Spark NLP model_anno_obj sent_biobert_pubmed_pmc_base_cased\n", + "nlu.load('en.embed_sentence.bge_base') returns Spark NLP model_anno_obj bge_base\n", + "nlu.load('en.embed_sentence.bge_small') returns Spark NLP model_anno_obj bge_small\n", + "nlu.load('en.embed_sentence.bge_large') returns Spark NLP model_anno_obj bge_large\n", + "nlu.load('en.embed_sentence.covidbert.large_uncased') returns Spark NLP model_anno_obj sent_covidbert_large_uncased\n", + "nlu.load('en.embed_sentence.distil_roberta.distilled_base') returns Spark NLP model_anno_obj sent_distilroberta_base\n", + "nlu.load('en.embed_sentence.doc2vec') returns Spark NLP model_anno_obj doc2vec_gigaword_300\n", + "nlu.load('en.embed_sentence.doc2vec.gigaword_300') returns Spark NLP model_anno_obj doc2vec_gigaword_300\n", + "nlu.load('en.embed_sentence.doc2vec.gigaword_wiki_300') returns Spark NLP model_anno_obj doc2vec_gigaword_wiki_300\n", + "nlu.load('en.embed_sentence.e5_small') returns Spark NLP model_anno_obj e5_small\n", + "nlu.load('en.embed_sentence.e5_small_opt') returns Spark NLP model_anno_obj e5_small_opt\n", + "nlu.load('en.embed_sentence.e5_small_v2_opt') returns Spark NLP model_anno_obj e5_small_v2_opt\n", + "nlu.load('en.embed_sentence.e5_base_v2') returns Spark NLP model_anno_obj e5_base_v2\n", + "nlu.load('en.embed_sentence.e5_base') returns Spark NLP model_anno_obj e5_base\n", + "nlu.load('en.embed_sentence.e5_base_v2_opt') returns Spark NLP model_anno_obj e5_base_v2_opt\n", + "nlu.load('en.embed_sentence.e5_base_quantized') returns Spark NLP model_anno_obj e5_base_quantized\n", + "nlu.load('en.embed_sentence.e5_base_opt') returns Spark NLP model_anno_obj e5_base_opt\n", + "nlu.load('en.embed_sentence.e5_base_v2_quantized') returns Spark NLP model_anno_obj e5_base_v2_quantized\n", + "nlu.load('en.embed_sentence.e5_small_v2_quantized') returns Spark NLP model_anno_obj e5_small_v2_quantized\n", + "nlu.load('en.embed_sentence.e5_large_v2') returns Spark NLP model_anno_obj e5_large_v2\n", + "nlu.load('en.embed_sentence.e5_small_v2') returns Spark NLP model_anno_obj e5_small_v2\n", + "nlu.load('en.embed_sentence.e5_small_quantized') returns Spark NLP model_anno_obj e5_small_quantized\n", + "nlu.load('en.embed_sentence.e5_large_v2_opt') returns Spark NLP model_anno_obj e5_large_v2_opt\n", + "nlu.load('en.embed_sentence.e5_large_v2_quantized') returns Spark NLP model_anno_obj e5_large_v2_quantized\n", + "nlu.load('en.embed_sentence.e5_large') returns Spark NLP model_anno_obj e5_large\n", + "nlu.load('en.embed_sentence.electra') returns Spark NLP model_anno_obj sent_electra_small_uncased\n", + "nlu.load('en.embed_sentence.electra_base_uncased') returns Spark NLP model_anno_obj sent_electra_base_uncased\n", + "nlu.load('en.embed_sentence.electra_large_uncased') returns Spark NLP model_anno_obj sent_electra_large_uncased\n", + 
"nlu.load('en.embed_sentence.electra_small_uncased') returns Spark NLP model_anno_obj sent_electra_small_uncased\n", + "nlu.load('en.embed_sentence.mpnet.579_stmodel_product_rem_v3a') returns Spark NLP model_anno_obj 579_stmodel_product_rem_v3a\n", + "nlu.load('en.embed_sentence.mpnet.abstract_sim_query') returns Spark NLP model_anno_obj abstract_sim_query\n", + "nlu.load('en.embed_sentence.mpnet.abstract_sim_sentence') returns Spark NLP model_anno_obj abstract_sim_sentence\n", + "nlu.load('en.embed_sentence.mpnet.action_policy_plans_classifier') returns Spark NLP model_anno_obj action_policy_plans_classifier\n", + "nlu.load('en.embed_sentence.mpnet.all_datasets_v3_mpnet_base') returns Spark NLP model_anno_obj all_datasets_v3_mpnet_base\n", + "nlu.load('en.embed_sentence.mpnet.all_datasets_v4_mpnet_base') returns Spark NLP model_anno_obj all_datasets_v4_mpnet_base\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_questions_clustering_english') returns Spark NLP model_anno_obj all_mpnet_base_questions_clustering_english\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v1') returns Spark NLP model_anno_obj all_mpnet_base_v1\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2') returns Spark NLP model_anno_obj all_mpnet_base_v2\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_diptanuc') returns Spark NLP model_anno_obj all_mpnet_base_v2_diptanuc\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_embedding_all') returns Spark NLP model_anno_obj all_mpnet_base_v2_embedding_all\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_feature_extraction') returns Spark NLP model_anno_obj all_mpnet_base_v2_feature_extraction\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_feature_extraction_pipeline') returns Spark NLP model_anno_obj all_mpnet_base_v2_feature_extraction_pipeline\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_finetuned_v2') returns Spark NLP model_anno_obj all_mpnet_base_v2_finetuned_v2\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_for_sb_clustering') returns Spark NLP model_anno_obj all_mpnet_base_v2_for_sb_clustering\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_ftlegal_v3') returns Spark NLP model_anno_obj all_mpnet_base_v2_ftlegal_v3\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_obrizum') returns Spark NLP model_anno_obj all_mpnet_base_v2_obrizum\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_sentence_transformers') returns Spark NLP model_anno_obj all_mpnet_base_v2_sentence_transformers\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_table') returns Spark NLP model_anno_obj all_mpnet_base_v2_table\n", + "nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2_tasky_classification') returns Spark NLP model_anno_obj all_mpnet_base_v2_tasky_classification\n", + "nlu.load('en.embed_sentence.mpnet.attack_bert') returns Spark NLP model_anno_obj attack_bert\n", + "nlu.load('en.embed_sentence.mpnet.biolord_stamb2_v1') returns Spark NLP model_anno_obj biolord_stamb2_v1\n", + "nlu.load('en.embed_sentence.mpnet.burmese_awesome_setfit_model') returns Spark NLP model_anno_obj burmese_awesome_setfit_model\n", + "nlu.load('en.embed_sentence.mpnet.burmese_awesome_setfit_model_98') returns Spark NLP model_anno_obj burmese_awesome_setfit_model_98\n", + "nlu.load('en.embed_sentence.mpnet.contradiction_psb') returns Spark NLP model_anno_obj contradiction_psb\n", + "nlu.load('en.embed_sentence.mpnet.contradiction_psb_lds') returns Spark NLP model_anno_obj 
contradiction_psb_lds\n", + "nlu.load('en.embed_sentence.mpnet.covid_qa_mpnet') returns Spark NLP model_anno_obj covid_qa_mpnet\n", + "nlu.load('en.embed_sentence.mpnet.cpu_conditional_classifier') returns Spark NLP model_anno_obj cpu_conditional_classifier\n", + "nlu.load('en.embed_sentence.mpnet.cpu_economywide_classifier') returns Spark NLP model_anno_obj cpu_economywide_classifier\n", + "nlu.load('en.embed_sentence.mpnet.cpu_mitigation_classifier') returns Spark NLP model_anno_obj cpu_mitigation_classifier\n", + "nlu.load('en.embed_sentence.mpnet.cpu_netzero_classifier') returns Spark NLP model_anno_obj cpu_netzero_classifier\n", + "nlu.load('en.embed_sentence.mpnet.cpu_target_classifier') returns Spark NLP model_anno_obj cpu_target_classifier\n", + "nlu.load('en.embed_sentence.mpnet.cpu_transport_ghg_classifier') returns Spark NLP model_anno_obj cpu_transport_ghg_classifier\n", + "nlu.load('en.embed_sentence.mpnet.cross_all_mpnet_base_v2_finetuned_webnlg2020_metric_average') returns Spark NLP model_anno_obj cross_all_mpnet_base_v2_finetuned_webnlg2020_metric_average\n", + "nlu.load('en.embed_sentence.mpnet.domainadaptm2') returns Spark NLP model_anno_obj domainadaptm2\n", + "nlu.load('en.embed_sentence.mpnet.due_eshop_21') returns Spark NLP model_anno_obj due_eshop_21\n", + "nlu.load('en.embed_sentence.mpnet.due_eshop_21_multilabel') returns Spark NLP model_anno_obj due_eshop_21_multilabel\n", + "nlu.load('en.embed_sentence.mpnet.due_retail_25') returns Spark NLP model_anno_obj due_retail_25\n", + "nlu.load('en.embed_sentence.mpnet.ecolo_pas_ecolo_v0.1') returns Spark NLP model_anno_obj ecolo_pas_ecolo_v0.1\n", + "nlu.load('en.embed_sentence.mpnet.esci_jp_mpnet_crossencoder') returns Spark NLP model_anno_obj esci_jp_mpnet_crossencoder\n", + "nlu.load('en.embed_sentence.mpnet.eth_setfit_payment_model') returns Spark NLP model_anno_obj eth_setfit_payment_model\n", + "nlu.load('en.embed_sentence.mpnet.fail_detect') returns Spark NLP model_anno_obj fail_detect\n", + "nlu.load('en.embed_sentence.mpnet.few_shot_model') returns Spark NLP model_anno_obj few_shot_model\n", + "nlu.load('en.embed_sentence.mpnet.fewshotissueclassifier_nlbse23') returns Spark NLP model_anno_obj fewshotissueclassifier_nlbse23\n", + "nlu.load('en.embed_sentence.mpnet.github_issues_mpnet_southern_sotho_e10') returns Spark NLP model_anno_obj github_issues_mpnet_southern_sotho_e10\n", + "nlu.load('en.embed_sentence.mpnet.github_issues_preprocessed_mpnet_southern_sotho_e10') returns Spark NLP model_anno_obj github_issues_preprocessed_mpnet_southern_sotho_e10\n", + "nlu.load('en.embed_sentence.mpnet.ikitracs_conditional') returns Spark NLP model_anno_obj ikitracs_conditional\n", + "nlu.load('en.embed_sentence.mpnet.ikitracs_mitigation') returns Spark NLP model_anno_obj ikitracs_mitigation\n", + "nlu.load('en.embed_sentence.mpnet.initial_model') returns Spark NLP model_anno_obj initial_model\n", + "nlu.load('en.embed_sentence.mpnet.initial_model_v3') returns Spark NLP model_anno_obj initial_model_v3\n", + "nlu.load('en.embed_sentence.mpnet.invoiceornot') returns Spark NLP model_anno_obj invoiceornot\n", + "nlu.load('en.embed_sentence.mpnet.java_deprecation_classifier') returns Spark NLP model_anno_obj java_deprecation_classifier\n", + "nlu.load('en.embed_sentence.mpnet.java_expand_classifier') returns Spark NLP model_anno_obj java_expand_classifier\n", + "nlu.load('en.embed_sentence.mpnet.java_ownership_classifier') returns Spark NLP model_anno_obj java_ownership_classifier\n", + 
"nlu.load('en.embed_sentence.mpnet.java_pointer_classifier') returns Spark NLP model_anno_obj java_pointer_classifier\n", + "nlu.load('en.embed_sentence.mpnet.java_rational_classifier') returns Spark NLP model_anno_obj java_rational_classifier\n", + "nlu.load('en.embed_sentence.mpnet.java_summary_classifier') returns Spark NLP model_anno_obj java_summary_classifier\n", + "nlu.load('en.embed_sentence.mpnet.java_usage_classifier') returns Spark NLP model_anno_obj java_usage_classifier\n", + "nlu.load('en.embed_sentence.mpnet.keyphrase_mpnet_v1') returns Spark NLP model_anno_obj keyphrase_mpnet_v1\n", + "nlu.load('en.embed_sentence.mpnet.kw_classification_setfit_model') returns Spark NLP model_anno_obj kw_classification_setfit_model\n", + "nlu.load('en.embed_sentence.mpnet.kw_classification_setfithead_model') returns Spark NLP model_anno_obj kw_classification_setfithead_model\n", + "nlu.load('en.embed_sentence.mpnet.labels_per_job_title_fine_tune') returns Spark NLP model_anno_obj labels_per_job_title_fine_tune\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_adaptation_mitigation_classifier') returns Spark NLP model_anno_obj mpnet_adaptation_mitigation_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_base_articles_ner') returns Spark NLP model_anno_obj mpnet_base_articles_ner\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_base_snli_mnli') returns Spark NLP model_anno_obj mpnet_base_snli_mnli\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_579_stmodel_product_rem_v3a') returns Spark NLP model_anno_obj mpnet_embedding_579_STmodel_product_rem_v3a\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_attack_bert') returns Spark NLP model_anno_obj mpnet_embedding_ATTACK_BERT\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_biolord_stamb2_v1') returns Spark NLP model_anno_obj mpnet_embedding_BioLORD_STAMB2_v1\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_cpu_conditional_classifier') returns Spark NLP model_anno_obj mpnet_embedding_CPU_Conditional_Classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_cpu_economywide_classifier') returns Spark NLP model_anno_obj mpnet_embedding_CPU_Economywide_Classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_cpu_mitigation_classifier') returns Spark NLP model_anno_obj mpnet_embedding_CPU_Mitigation_Classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_cpu_netzero_classifier') returns Spark NLP model_anno_obj mpnet_embedding_CPU_Netzero_Classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_cpu_target_classifier') returns Spark NLP model_anno_obj mpnet_embedding_CPU_Target_Classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_cpu_transport_ghg_classifier') returns Spark NLP model_anno_obj mpnet_embedding_CPU_Transport_GHG_Classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_domainadaptm2') returns Spark NLP model_anno_obj mpnet_embedding_DomainAdaptM2\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_fewshotissueclassifier_nlbse23') returns Spark NLP model_anno_obj mpnet_embedding_FewShotIssueClassifier_NLBSE23\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_invoiceornot') returns Spark NLP model_anno_obj mpnet_embedding_InvoiceOrNot\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_pdfsegs') returns Spark NLP model_anno_obj mpnet_embedding_PDFSegs\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_patentsberta') returns Spark NLP model_anno_obj mpnet_embedding_PatentSBERTa\n", + 
"nlu.load('en.embed_sentence.mpnet.mpnet_embedding_patentsberta_v2') returns Spark NLP model_anno_obj mpnet_embedding_PatentSBERTa_V2\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_sentiment140_fewshot') returns Spark NLP model_anno_obj mpnet_embedding_Sentiment140_fewshot\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_all_data') returns Spark NLP model_anno_obj mpnet_embedding_SetFit_all_data\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_few_shot_classifier') returns Spark NLP model_anno_obj mpnet_embedding_Setfit_few_shot_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_abstract_sim_query') returns Spark NLP model_anno_obj mpnet_embedding_abstract_sim_query\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_abstract_sim_sentence') returns Spark NLP model_anno_obj mpnet_embedding_abstract_sim_sentence\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_action_policy_plans_classifier') returns Spark NLP model_anno_obj mpnet_embedding_action_policy_plans_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_datasets_v3_mpnet_base') returns Spark NLP model_anno_obj mpnet_embedding_all_datasets_v3_mpnet_base\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_datasets_v4_mpnet_base') returns Spark NLP model_anno_obj mpnet_embedding_all_datasets_v4_mpnet_base\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_questions_clustering_english') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_questions_clustering_english\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v1') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v1\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_by_diptanuc') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_by_diptanuc\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_by_obrizum') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_by_obrizum\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_by_sentence_transformers') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_by_sentence_transformers\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_embedding_all') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_embedding_all\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_feature_extraction') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_feature_extraction\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_feature_extraction_pipeline') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_feature_extraction_pipeline\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_finetuned_v2') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_finetuned_v2\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_for_sb_clustering') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_for_sb_clustering\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_ftlegal_v3') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_ftlegal_v3\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_table') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_table\n", + 
"nlu.load('en.embed_sentence.mpnet.mpnet_embedding_all_mpnet_base_v2_tasky_classification') returns Spark NLP model_anno_obj mpnet_embedding_all_mpnet_base_v2_tasky_classification\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_burmese_awesome_setfit_model') returns Spark NLP model_anno_obj mpnet_embedding_burmese_awesome_setfit_model\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_burmese_awesome_setfit_model_98') returns Spark NLP model_anno_obj mpnet_embedding_burmese_awesome_setfit_model_98\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_contradiction_psb') returns Spark NLP model_anno_obj mpnet_embedding_contradiction_psb\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_contradiction_psb_lds') returns Spark NLP model_anno_obj mpnet_embedding_contradiction_psb_lds\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_covid_qa_mpnet') returns Spark NLP model_anno_obj mpnet_embedding_covid_qa_mpnet\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_cross_all_mpnet_base_v2_finetuned_webnlg2020_metric_average') returns Spark NLP model_anno_obj mpnet_embedding_cross_all_mpnet_base_v2_finetuned_WebNLG2020_metric_average\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_due_eshop_21') returns Spark NLP model_anno_obj mpnet_embedding_due_eshop_21\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_due_eshop_21_multilabel') returns Spark NLP model_anno_obj mpnet_embedding_due_eshop_21_multilabel\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_due_retail_25') returns Spark NLP model_anno_obj mpnet_embedding_due_retail_25\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_ecolo_pas_ecolo_v0.1') returns Spark NLP model_anno_obj mpnet_embedding_ecolo_pas_ecolo_v0.1\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_esci_jp_mpnet_crossencoder') returns Spark NLP model_anno_obj mpnet_embedding_esci_jp_mpnet_crossencoder\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_eth_setfit_payment_model') returns Spark NLP model_anno_obj mpnet_embedding_eth_setfit_payment_model\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_fail_detect') returns Spark NLP model_anno_obj mpnet_embedding_fail_detect\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_few_shot_model') returns Spark NLP model_anno_obj mpnet_embedding_few_shot_model\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_finetunned_sbert') returns Spark NLP model_anno_obj mpnet_embedding_finetunned_sbert\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_github_issues_mpnet_southern_sotho_e10') returns Spark NLP model_anno_obj mpnet_embedding_github_issues_mpnet_southern_sotho_e10\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_github_issues_mpnet_st_e10') returns Spark NLP model_anno_obj mpnet_embedding_github_issues_mpnet_st_e10\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_github_issues_preprocessed_mpnet_southern_sotho_e10') returns Spark NLP model_anno_obj mpnet_embedding_github_issues_preprocessed_mpnet_southern_sotho_e10\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_github_issues_preprocessed_mpnet_st_e10') returns Spark NLP model_anno_obj mpnet_embedding_github_issues_preprocessed_mpnet_st_e10\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_ikitracs_conditional') returns Spark NLP model_anno_obj mpnet_embedding_ikitracs_conditional\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_ikitracs_mitigation') returns Spark NLP model_anno_obj mpnet_embedding_ikitracs_mitigation\n", + 
"nlu.load('en.embed_sentence.mpnet.mpnet_embedding_initial_model') returns Spark NLP model_anno_obj mpnet_embedding_initial_model\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_initial_model_v3') returns Spark NLP model_anno_obj mpnet_embedding_initial_model_v3\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_java_deprecation_classifier') returns Spark NLP model_anno_obj mpnet_embedding_java_deprecation_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_java_expand_classifier') returns Spark NLP model_anno_obj mpnet_embedding_java_expand_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_java_ownership_classifier') returns Spark NLP model_anno_obj mpnet_embedding_java_ownership_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_java_pointer_classifier') returns Spark NLP model_anno_obj mpnet_embedding_java_pointer_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_java_rational_classifier') returns Spark NLP model_anno_obj mpnet_embedding_java_rational_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_java_summary_classifier') returns Spark NLP model_anno_obj mpnet_embedding_java_summary_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_java_usage_classifier') returns Spark NLP model_anno_obj mpnet_embedding_java_usage_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_keyphrase_mpnet_v1') returns Spark NLP model_anno_obj mpnet_embedding_keyphrase_mpnet_v1\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_kw_classification_setfit_model') returns Spark NLP model_anno_obj mpnet_embedding_kw_classification_setfit_model\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_kw_classification_setfithead_model') returns Spark NLP model_anno_obj mpnet_embedding_kw_classification_setfithead_model\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_labels_per_job_title_fine_tune') returns Spark NLP model_anno_obj mpnet_embedding_labels_per_job_title_fine_tune\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_adaptation_mitigation_classifier') returns Spark NLP model_anno_obj mpnet_embedding_mpnet_adaptation_mitigation_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_base') returns Spark NLP model_anno_obj mpnet_embedding_mpnet_base\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_base_articles_ner') returns Spark NLP model_anno_obj mpnet_embedding_mpnet_base_articles_ner\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_base_snli_mnli') returns Spark NLP model_anno_obj mpnet_embedding_mpnet_base_snli_mnli\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_mnr_v2_fine_tuned') returns Spark NLP model_anno_obj mpnet_embedding_mpnet_mnr_v2_fine_tuned\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_multilabel_sector_classifier') returns Spark NLP model_anno_obj mpnet_embedding_mpnet_multilabel_sector_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_nli_sts') returns Spark NLP model_anno_obj mpnet_embedding_mpnet_nli_sts\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_retriever_squad2') returns Spark NLP model_anno_obj mpnet_embedding_mpnet_retriever_squad2\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_snli') returns Spark NLP model_anno_obj mpnet_embedding_mpnet_snli\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_mpnet_snli_negatives') returns Spark NLP model_anno_obj 
mpnet_embedding_mpnet_snli_negatives\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_v1_mpnet_asymmetric_a') returns Spark NLP model_anno_obj mpnet_embedding_multi_QA_v1_mpnet_asymmetric_A\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_v1_mpnet_asymmetric_q') returns Spark NLP model_anno_obj mpnet_embedding_multi_QA_v1_mpnet_asymmetric_Q\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_mpnet_base_cos_v1') returns Spark NLP model_anno_obj mpnet_embedding_multi_qa_mpnet_base_cos_v1\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_mpnet_base_cos_v1_by_navteca') returns Spark NLP model_anno_obj mpnet_embedding_multi_qa_mpnet_base_cos_v1_by_navteca\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_mpnet_base_cos_v1_by_sentence_transformers') returns Spark NLP model_anno_obj mpnet_embedding_multi_qa_mpnet_base_cos_v1_by_sentence_transformers\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_mpnet_base_dot_v1') returns Spark NLP model_anno_obj mpnet_embedding_multi_qa_mpnet_base_dot_v1\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_mpnet_base_dot_v1_by_model_embeddings') returns Spark NLP model_anno_obj mpnet_embedding_multi_qa_mpnet_base_dot_v1_by_model_embeddings\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_mpnet_base_dot_v1_by_sentence_transformers') returns Spark NLP model_anno_obj mpnet_embedding_multi_qa_mpnet_base_dot_v1_by_sentence_transformers\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_mpnet_base_dot_v1_eclass') returns Spark NLP model_anno_obj mpnet_embedding_multi_qa_mpnet_base_dot_v1_eclass\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_mpnet_base_dot_v1_legal_finetune') returns Spark NLP model_anno_obj mpnet_embedding_multi_qa_mpnet_base_dot_v1_legal_finetune\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_multi_qa_v1_mpnet_cls_dot') returns Spark NLP model_anno_obj mpnet_embedding_multi_qa_v1_mpnet_cls_dot\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_my_awesome_setfit_model_98') returns Spark NLP model_anno_obj mpnet_embedding_my_awesome_setfit_model_98\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_nli_mpnet_base_v2') returns Spark NLP model_anno_obj mpnet_embedding_nli_mpnet_base_v2\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_nli_mpnet_base_v2_by_sentence_transformers') returns Spark NLP model_anno_obj mpnet_embedding_nli_mpnet_base_v2_by_sentence_transformers\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_nooks_amd_detection_realtime') returns Spark NLP model_anno_obj mpnet_embedding_nooks_amd_detection_realtime\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_nooks_amd_detection_v2_full') returns Spark NLP model_anno_obj mpnet_embedding_nooks_amd_detection_v2_full\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_nps_psb_lds') returns Spark NLP model_anno_obj mpnet_embedding_nps_psb_lds\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_ouvrage_classif') returns Spark NLP model_anno_obj mpnet_embedding_ouvrage_classif\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_paraphrase_mpnet_base_v2') returns Spark NLP model_anno_obj mpnet_embedding_paraphrase_mpnet_base_v2\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_paraphrase_mpnet_base_v2_setfit_sst2') returns Spark NLP model_anno_obj mpnet_embedding_paraphrase_mpnet_base_v2_SetFit_sst2\n", + 
"nlu.load('en.embed_sentence.mpnet.mpnet_embedding_paraphrase_mpnet_base_v2_by_sentence_transformers') returns Spark NLP model_anno_obj mpnet_embedding_paraphrase_mpnet_base_v2_by_sentence_transformers\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_paraphrase_mpnet_base_v2_finetuned_polifact') returns Spark NLP model_anno_obj mpnet_embedding_paraphrase_mpnet_base_v2_finetuned_polifact\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_paraphrase_mpnet_base_v2_fuzzy_matcher') returns Spark NLP model_anno_obj mpnet_embedding_paraphrase_mpnet_base_v2_fuzzy_matcher\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_pharo_collaborators_classifier') returns Spark NLP model_anno_obj mpnet_embedding_pharo_collaborators_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_pharo_example_classifier') returns Spark NLP model_anno_obj mpnet_embedding_pharo_example_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_pharo_keyimplementationpoints_classifier') returns Spark NLP model_anno_obj mpnet_embedding_pharo_keyimplementationpoints_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_pharo_responsibilities_classifier') returns Spark NLP model_anno_obj mpnet_embedding_pharo_responsibilities_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_python_developmentnotes_classifier') returns Spark NLP model_anno_obj mpnet_embedding_python_developmentnotes_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_python_expand_classifier') returns Spark NLP model_anno_obj mpnet_embedding_python_expand_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_python_parameters_classifier') returns Spark NLP model_anno_obj mpnet_embedding_python_parameters_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_python_summary_classifier') returns Spark NLP model_anno_obj mpnet_embedding_python_summary_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_python_usage_classifier') returns Spark NLP model_anno_obj mpnet_embedding_python_usage_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_reddit_single_context_mpnet_base') returns Spark NLP model_anno_obj mpnet_embedding_reddit_single_context_mpnet_base\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_retriever_coding_guru_adapted') returns Spark NLP model_anno_obj mpnet_embedding_retriever_coding_guru_adapted\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_review_intent_20230116') returns Spark NLP model_anno_obj mpnet_embedding_review_intent_20230116\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_review_multiclass_20230116') returns Spark NLP model_anno_obj mpnet_embedding_review_multiclass_20230116\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_sb_temfac') returns Spark NLP model_anno_obj mpnet_embedding_sb_temfac\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_sbert_paper') returns Spark NLP model_anno_obj mpnet_embedding_sbert_paper\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_sentence_transformers_bible_reference_final') returns Spark NLP model_anno_obj mpnet_embedding_sentence_transformers_bible_reference_final\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ag_news_endpoint') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ag_news_endpoint\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ds_version_0_0_1') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ds_version_0_0_1\n", + 
"nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ds_version_0_0_2') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ds_version_0_0_2\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ds_version_0_0_4') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ds_version_0_0_4\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ds_version_0_0_5') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ds_version_0_0_5\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ethos_multilabel_example') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ethos_multilabel_example\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ethos_multilabel_example_by_lewtun') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ethos_multilabel_example_by_lewtun\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ethos_multilabel_example_by_neilthematic') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ethos_multilabel_example_by_neilthematic\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_finetuned_financial_text') returns Spark NLP model_anno_obj mpnet_embedding_setfit_finetuned_financial_text\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ft_sentinent_eval') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ft_sentinent_eval\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_model') returns Spark NLP model_anno_obj mpnet_embedding_setfit_model\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_model_feb11_misinformation_on_law') returns Spark NLP model_anno_obj mpnet_embedding_setfit_model_Feb11_Misinformation_on_Law\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_model_by_pradipta11') returns Spark NLP model_anno_obj mpnet_embedding_setfit_model_by_pradipta11\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_model_by_rajistics') returns Spark NLP model_anno_obj mpnet_embedding_setfit_model_by_rajistics\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_model_test_sensitve_v1') returns Spark NLP model_anno_obj mpnet_embedding_setfit_model_test_sensitve_v1\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_occupation') returns Spark NLP model_anno_obj mpnet_embedding_setfit_occupation\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_ostrom') returns Spark NLP model_anno_obj mpnet_embedding_setfit_ostrom\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p1') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p1\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p1_comm') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p1_comm\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p1_life') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p1_life\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p1_likes') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p1_likes\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_bhvr') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_bhvr\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_cons') returns Spark 
NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_cons\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_dur') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_dur\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_func') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_func\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_sev') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_sev\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_trig') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p3_trig\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_achiev') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_achiev\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_meas') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_meas\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_rel') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_rel\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_specific') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_specific\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_time') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_p4_time\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_setfit_zero_shot_classification_pbsp_q8a_azure_gpt35') returns Spark NLP model_anno_obj mpnet_embedding_setfit_zero_shot_classification_pbsp_q8a_azure_gpt35\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_shona_mpnet_base_snli_mnli') returns Spark NLP model_anno_obj mpnet_embedding_shona_mpnet_base_snli_mnli\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_sml_ukr_message_classifier') returns Spark NLP model_anno_obj mpnet_embedding_sml_ukr_message_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_sml_ukr_word_classifier_medium') returns Spark NLP model_anno_obj mpnet_embedding_sml_ukr_word_classifier_medium\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_sn_mpnet_base_snli_mnli') returns Spark NLP model_anno_obj mpnet_embedding_sn_mpnet_base_snli_mnli\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_spiced') returns Spark NLP model_anno_obj mpnet_embedding_spiced\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_stackoverflow_mpnet_base') returns Spark NLP model_anno_obj mpnet_embedding_stackoverflow_mpnet_base\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_stsb_mpnet_base_v2') returns Spark NLP model_anno_obj mpnet_embedding_stsb_mpnet_base_v2\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_test_food') returns Spark NLP model_anno_obj mpnet_embedding_test_food\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_test_model_001') returns Spark NLP model_anno_obj mpnet_embedding_test_model_001\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetformaskedlm') returns Spark NLP model_anno_obj 
mpnet_embedding_tiny_random_MPNetForMaskedLM\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetforquestionanswering') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_MPNetForQuestionAnswering\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetforsequenceclassification') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_MPNetForSequenceClassification\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetfortokenclassification') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_MPNetForTokenClassification\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetmodel') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_MPNetModel\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnet_by_hf_internal_testing') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnet_by_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetformaskedlm_by_hf_internal_testing') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetformaskedlm_by_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetformaskedlm_by_hf_tiny_model_private') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetformaskedlm_by_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetforquestionanswering_by_hf_internal_testing') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetforquestionanswering_by_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetforquestionanswering_by_hf_tiny_model_private') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetforquestionanswering_by_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetforsequenceclassification_by_hf_internal_testing') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetforsequenceclassification_by_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetforsequenceclassification_by_hf_tiny_model_private') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetforsequenceclassification_by_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetfortokenclassification_by_hf_internal_testing') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetfortokenclassification_by_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetfortokenclassification_by_hf_tiny_model_private') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetfortokenclassification_by_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetmodel_by_hf_internal_testing') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetmodel_by_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_tiny_random_mpnetmodel_by_hf_tiny_model_private') returns Spark NLP model_anno_obj mpnet_embedding_tiny_random_mpnetmodel_by_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_embedding_vulnerable_groups') returns Spark NLP model_anno_obj mpnet_embedding_vulnerable_groups\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_mnr_v2_fine_tuned') returns Spark NLP model_anno_obj mpnet_mnr_v2_fine_tuned\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_multilabel_sector_classifier') 
returns Spark NLP model_anno_obj mpnet_multilabel_sector_classifier\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_nli_sts') returns Spark NLP model_anno_obj mpnet_nli_sts\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_retriever_squad2') returns Spark NLP model_anno_obj mpnet_retriever_squad2\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_snli') returns Spark NLP model_anno_obj mpnet_snli\n", + "nlu.load('en.embed_sentence.mpnet.mpnet_snli_negatives') returns Spark NLP model_anno_obj mpnet_snli_negatives\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_mpnet_base_cos_v1') returns Spark NLP model_anno_obj multi_qa_mpnet_base_cos_v1\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_mpnet_base_cos_v1_navteca') returns Spark NLP model_anno_obj multi_qa_mpnet_base_cos_v1_navteca\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_mpnet_base_cos_v1_sentence_transformers') returns Spark NLP model_anno_obj multi_qa_mpnet_base_cos_v1_sentence_transformers\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_mpnet_base_dot_v1') returns Spark NLP model_anno_obj multi_qa_mpnet_base_dot_v1\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_mpnet_base_dot_v1_eclass') returns Spark NLP model_anno_obj multi_qa_mpnet_base_dot_v1_eclass\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_mpnet_base_dot_v1_legal_finetune') returns Spark NLP model_anno_obj multi_qa_mpnet_base_dot_v1_legal_finetune\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_mpnet_base_dot_v1_model_embeddings') returns Spark NLP model_anno_obj multi_qa_mpnet_base_dot_v1_model_embeddings\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_mpnet_base_dot_v1_sentence_transformers') returns Spark NLP model_anno_obj multi_qa_mpnet_base_dot_v1_sentence_transformers\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_v1_mpnet_asymmetric_a') returns Spark NLP model_anno_obj multi_qa_v1_mpnet_asymmetric_a\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_v1_mpnet_asymmetric_q') returns Spark NLP model_anno_obj multi_qa_v1_mpnet_asymmetric_q\n", + "nlu.load('en.embed_sentence.mpnet.multi_qa_v1_mpnet_cls_dot') returns Spark NLP model_anno_obj multi_qa_v1_mpnet_cls_dot\n", + "nlu.load('en.embed_sentence.mpnet.nli_mpnet_base_v2') returns Spark NLP model_anno_obj nli_mpnet_base_v2\n", + "nlu.load('en.embed_sentence.mpnet.nli_mpnet_base_v2_sentence_transformers') returns Spark NLP model_anno_obj nli_mpnet_base_v2_sentence_transformers\n", + "nlu.load('en.embed_sentence.mpnet.nooks_amd_detection_realtime') returns Spark NLP model_anno_obj nooks_amd_detection_realtime\n", + "nlu.load('en.embed_sentence.mpnet.nooks_amd_detection_v2_full') returns Spark NLP model_anno_obj nooks_amd_detection_v2_full\n", + "nlu.load('en.embed_sentence.mpnet.nps_psb_lds') returns Spark NLP model_anno_obj nps_psb_lds\n", + "nlu.load('en.embed_sentence.mpnet.ouvrage_classif') returns Spark NLP model_anno_obj ouvrage_classif\n", + "nlu.load('en.embed_sentence.mpnet.paraphrase_mpnet_base_v2') returns Spark NLP model_anno_obj paraphrase_mpnet_base_v2\n", + "nlu.load('en.embed_sentence.mpnet.paraphrase_mpnet_base_v2_finetuned_polifact') returns Spark NLP model_anno_obj paraphrase_mpnet_base_v2_finetuned_polifact\n", + "nlu.load('en.embed_sentence.mpnet.paraphrase_mpnet_base_v2_fuzzy_matcher') returns Spark NLP model_anno_obj paraphrase_mpnet_base_v2_fuzzy_matcher\n", + "nlu.load('en.embed_sentence.mpnet.paraphrase_mpnet_base_v2_sentence_transformers') returns Spark NLP model_anno_obj paraphrase_mpnet_base_v2_sentence_transformers\n", + 
"nlu.load('en.embed_sentence.mpnet.paraphrase_mpnet_base_v2_setfit_sst2') returns Spark NLP model_anno_obj paraphrase_mpnet_base_v2_setfit_sst2\n", + "nlu.load('en.embed_sentence.mpnet.patentsberta') returns Spark NLP model_anno_obj patentsberta\n", + "nlu.load('en.embed_sentence.mpnet.patentsberta_v2') returns Spark NLP model_anno_obj patentsberta_v2\n", + "nlu.load('en.embed_sentence.mpnet.pdfsegs') returns Spark NLP model_anno_obj pdfsegs\n", + "nlu.load('en.embed_sentence.mpnet.pharo_collaborators_classifier') returns Spark NLP model_anno_obj pharo_collaborators_classifier\n", + "nlu.load('en.embed_sentence.mpnet.pharo_example_classifier') returns Spark NLP model_anno_obj pharo_example_classifier\n", + "nlu.load('en.embed_sentence.mpnet.pharo_keyimplementationpoints_classifier') returns Spark NLP model_anno_obj pharo_keyimplementationpoints_classifier\n", + "nlu.load('en.embed_sentence.mpnet.pharo_responsibilities_classifier') returns Spark NLP model_anno_obj pharo_responsibilities_classifier\n", + "nlu.load('en.embed_sentence.mpnet.python_developmentnotes_classifier') returns Spark NLP model_anno_obj python_developmentnotes_classifier\n", + "nlu.load('en.embed_sentence.mpnet.python_expand_classifier') returns Spark NLP model_anno_obj python_expand_classifier\n", + "nlu.load('en.embed_sentence.mpnet.python_parameters_classifier') returns Spark NLP model_anno_obj python_parameters_classifier\n", + "nlu.load('en.embed_sentence.mpnet.python_summary_classifier') returns Spark NLP model_anno_obj python_summary_classifier\n", + "nlu.load('en.embed_sentence.mpnet.python_usage_classifier') returns Spark NLP model_anno_obj python_usage_classifier\n", + "nlu.load('en.embed_sentence.mpnet.reddit_single_context_mpnet_base') returns Spark NLP model_anno_obj reddit_single_context_mpnet_base\n", + "nlu.load('en.embed_sentence.mpnet.retriever_coding_guru_adapted') returns Spark NLP model_anno_obj retriever_coding_guru_adapted\n", + "nlu.load('en.embed_sentence.mpnet.review_intent_20230116') returns Spark NLP model_anno_obj review_intent_20230116\n", + "nlu.load('en.embed_sentence.mpnet.review_multiclass_20230116') returns Spark NLP model_anno_obj review_multiclass_20230116\n", + "nlu.load('en.embed_sentence.mpnet.sb_temfac') returns Spark NLP model_anno_obj sb_temfac\n", + "nlu.load('en.embed_sentence.mpnet.sbert_paper') returns Spark NLP model_anno_obj sbert_paper\n", + "nlu.load('en.embed_sentence.mpnet.sentence_transformers_bible_reference_final') returns Spark NLP model_anno_obj sentence_transformers_bible_reference_final\n", + "nlu.load('en.embed_sentence.mpnet.sentiment140_fewshot') returns Spark NLP model_anno_obj sentiment140_fewshot\n", + "nlu.load('en.embed_sentence.mpnet.setfit_ag_news_endpoint') returns Spark NLP model_anno_obj setfit_ag_news_endpoint\n", + "nlu.load('en.embed_sentence.mpnet.setfit_all_data') returns Spark NLP model_anno_obj setfit_all_data\n", + "nlu.load('en.embed_sentence.mpnet.setfit_ds_version_0_0_1') returns Spark NLP model_anno_obj setfit_ds_version_0_0_1\n", + "nlu.load('en.embed_sentence.mpnet.setfit_ds_version_0_0_2') returns Spark NLP model_anno_obj setfit_ds_version_0_0_2\n", + "nlu.load('en.embed_sentence.mpnet.setfit_ds_version_0_0_4') returns Spark NLP model_anno_obj setfit_ds_version_0_0_4\n", + "nlu.load('en.embed_sentence.mpnet.setfit_ds_version_0_0_5') returns Spark NLP model_anno_obj setfit_ds_version_0_0_5\n", + "nlu.load('en.embed_sentence.mpnet.setfit_ethos_multilabel_example_lewtun') returns Spark NLP model_anno_obj 
setfit_ethos_multilabel_example_lewtun\n", + "nlu.load('en.embed_sentence.mpnet.setfit_ethos_multilabel_example_neilthematic') returns Spark NLP model_anno_obj setfit_ethos_multilabel_example_neilthematic\n", + "nlu.load('en.embed_sentence.mpnet.setfit_few_shot_classifier') returns Spark NLP model_anno_obj setfit_few_shot_classifier\n", + "nlu.load('en.embed_sentence.mpnet.setfit_finetuned_financial_text') returns Spark NLP model_anno_obj setfit_finetuned_financial_text\n", + "nlu.load('en.embed_sentence.mpnet.setfit_ft_sentinent_eval') returns Spark NLP model_anno_obj setfit_ft_sentinent_eval\n", + "nlu.load('en.embed_sentence.mpnet.setfit_model_feb11_misinformation_on_law') returns Spark NLP model_anno_obj setfit_model_feb11_misinformation_on_law\n", + "nlu.load('en.embed_sentence.mpnet.setfit_model_pradipta11') returns Spark NLP model_anno_obj setfit_model_pradipta11\n", + "nlu.load('en.embed_sentence.mpnet.setfit_model_rajistics') returns Spark NLP model_anno_obj setfit_model_rajistics\n", + "nlu.load('en.embed_sentence.mpnet.setfit_model_test_sensitve_v1') returns Spark NLP model_anno_obj setfit_model_test_sensitve_v1\n", + "nlu.load('en.embed_sentence.mpnet.setfit_occupation') returns Spark NLP model_anno_obj setfit_occupation\n", + "nlu.load('en.embed_sentence.mpnet.setfit_ostrom') returns Spark NLP model_anno_obj setfit_ostrom\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p1') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p1\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p1_comm') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p1_comm\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p1_life') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p1_life\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p1_likes') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p1_likes\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p3_bhvr') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p3_bhvr\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p3_cons') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p3_cons\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p3_dur') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p3_dur\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p3_func') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p3_func\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p3_sev') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p3_sev\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p3_trig') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p3_trig\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p4_achiev') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p4_achiev\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p4_meas') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p4_meas\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p4_rel') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p4_rel\n", + 
"nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p4_specific') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p4_specific\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_p4_time') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_p4_time\n", + "nlu.load('en.embed_sentence.mpnet.setfit_zero_shot_classification_pbsp_q8a_azure_gpt35') returns Spark NLP model_anno_obj setfit_zero_shot_classification_pbsp_q8a_azure_gpt35\n", + "nlu.load('en.embed_sentence.mpnet.shona_mpnet_base_snli_mnli') returns Spark NLP model_anno_obj shona_mpnet_base_snli_mnli\n", + "nlu.load('en.embed_sentence.mpnet.sml_ukr_message_classifier') returns Spark NLP model_anno_obj sml_ukr_message_classifier\n", + "nlu.load('en.embed_sentence.mpnet.sml_ukr_word_classifier_medium') returns Spark NLP model_anno_obj sml_ukr_word_classifier_medium\n", + "nlu.load('en.embed_sentence.mpnet.spiced') returns Spark NLP model_anno_obj spiced\n", + "nlu.load('en.embed_sentence.mpnet.stackoverflow_mpnet_base') returns Spark NLP model_anno_obj stackoverflow_mpnet_base\n", + "nlu.load('en.embed_sentence.mpnet.stsb_mpnet_base_v2') returns Spark NLP model_anno_obj stsb_mpnet_base_v2\n", + "nlu.load('en.embed_sentence.mpnet.test_food') returns Spark NLP model_anno_obj test_food\n", + "nlu.load('en.embed_sentence.mpnet.test_model_001') returns Spark NLP model_anno_obj test_model_001\n", + "nlu.load('en.embed_sentence.mpnet.testing_setfit') returns Spark NLP model_anno_obj testing_setfit\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnet_hf_internal_testing') returns Spark NLP model_anno_obj tiny_random_mpnet_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetformaskedlm_hf_internal_testing') returns Spark NLP model_anno_obj tiny_random_mpnetformaskedlm_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetformaskedlm_hf_tiny_model_private') returns Spark NLP model_anno_obj tiny_random_mpnetformaskedlm_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetformultiplechoice') returns Spark NLP model_anno_obj tiny_random_mpnetformultiplechoice\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetforquestionanswering_hf_internal_testing') returns Spark NLP model_anno_obj tiny_random_mpnetforquestionanswering_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetforquestionanswering_hf_tiny_model_private') returns Spark NLP model_anno_obj tiny_random_mpnetforquestionanswering_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetforsequenceclassification_hf_internal_testing') returns Spark NLP model_anno_obj tiny_random_mpnetforsequenceclassification_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetforsequenceclassification_hf_tiny_model_private') returns Spark NLP model_anno_obj tiny_random_mpnetforsequenceclassification_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetfortokenclassification_hf_internal_testing') returns Spark NLP model_anno_obj tiny_random_mpnetfortokenclassification_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetfortokenclassification_hf_tiny_model_private') returns Spark NLP model_anno_obj tiny_random_mpnetfortokenclassification_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetmodel_hf_internal_testing') returns Spark NLP model_anno_obj 
tiny_random_mpnetmodel_hf_internal_testing\n", + "nlu.load('en.embed_sentence.mpnet.tiny_random_mpnetmodel_hf_tiny_model_private') returns Spark NLP model_anno_obj tiny_random_mpnetmodel_hf_tiny_model_private\n", + "nlu.load('en.embed_sentence.mpnet.vulnerable_groups') returns Spark NLP model_anno_obj vulnerable_groups\n", + "nlu.load('en.embed_sentence.roberta.base') returns Spark NLP model_anno_obj sent_roberta_base\n", + "nlu.load('en.embed_sentence.roberta.large') returns Spark NLP model_anno_obj sent_roberta_large\n", + "nlu.load('en.embed_sentence.small_bert_L10_128') returns Spark NLP model_anno_obj sent_small_bert_L10_128\n", + "nlu.load('en.embed_sentence.small_bert_L10_256') returns Spark NLP model_anno_obj sent_small_bert_L10_256\n", + "nlu.load('en.embed_sentence.small_bert_L10_512') returns Spark NLP model_anno_obj sent_small_bert_L10_512\n", + "nlu.load('en.embed_sentence.small_bert_L10_768') returns Spark NLP model_anno_obj sent_small_bert_L10_768\n", + "nlu.load('en.embed_sentence.small_bert_L12_128') returns Spark NLP model_anno_obj sent_small_bert_L12_128\n", + "nlu.load('en.embed_sentence.small_bert_L12_256') returns Spark NLP model_anno_obj sent_small_bert_L12_256\n", + "nlu.load('en.embed_sentence.small_bert_L12_512') returns Spark NLP model_anno_obj sent_small_bert_L12_512\n", + "nlu.load('en.embed_sentence.small_bert_L12_768') returns Spark NLP model_anno_obj sent_small_bert_L12_768\n", + "nlu.load('en.embed_sentence.small_bert_L2_128') returns Spark NLP model_anno_obj sent_small_bert_L2_128\n", + "nlu.load('en.embed_sentence.small_bert_L2_256') returns Spark NLP model_anno_obj sent_small_bert_L2_256\n", + "nlu.load('en.embed_sentence.small_bert_L2_512') returns Spark NLP model_anno_obj sent_small_bert_L2_512\n", + "nlu.load('en.embed_sentence.small_bert_L2_768') returns Spark NLP model_anno_obj sent_small_bert_L2_768\n", + "nlu.load('en.embed_sentence.small_bert_L4_128') returns Spark NLP model_anno_obj sent_small_bert_L4_128\n", + "nlu.load('en.embed_sentence.small_bert_L4_256') returns Spark NLP model_anno_obj sent_small_bert_L4_256\n", + "nlu.load('en.embed_sentence.small_bert_L4_512') returns Spark NLP model_anno_obj sent_small_bert_L4_512\n", + "nlu.load('en.embed_sentence.small_bert_L4_768') returns Spark NLP model_anno_obj sent_small_bert_L4_768\n", + "nlu.load('en.embed_sentence.small_bert_L6_128') returns Spark NLP model_anno_obj sent_small_bert_L6_128\n", + "nlu.load('en.embed_sentence.small_bert_L6_256') returns Spark NLP model_anno_obj sent_small_bert_L6_256\n", + "nlu.load('en.embed_sentence.small_bert_L6_512') returns Spark NLP model_anno_obj sent_small_bert_L6_512\n", + "nlu.load('en.embed_sentence.small_bert_L6_768') returns Spark NLP model_anno_obj sent_small_bert_L6_768\n", + "nlu.load('en.embed_sentence.small_bert_L8_128') returns Spark NLP model_anno_obj sent_small_bert_L8_128\n", + "nlu.load('en.embed_sentence.small_bert_L8_256') returns Spark NLP model_anno_obj sent_small_bert_L8_256\n", + "nlu.load('en.embed_sentence.small_bert_L8_512') returns Spark NLP model_anno_obj sent_small_bert_L8_512\n", + "nlu.load('en.embed_sentence.small_bert_L8_768') returns Spark NLP model_anno_obj sent_small_bert_L8_768\n", + "nlu.load('en.embed_sentence.instructor_base') returns Spark NLP model_anno_obj instructor_base\n", + "nlu.load('en.embed_sentence.instructor_large') returns Spark NLP model_anno_obj instructor_large\n", + "nlu.load('en.embed_sentence.tfhub_use') returns Spark NLP model_anno_obj tfhub_use\n", + "nlu.load('en.embed_sentence.tfhub_use.lg') 
returns Spark NLP model_anno_obj tfhub_use_lg\n", + "nlu.load('en.embed_sentence.use') returns Spark NLP model_anno_obj tfhub_use\n", + "nlu.load('en.embed_sentence.use.lg') returns Spark NLP model_anno_obj tfhub_use_lg\n", + "For language NLU provides the following Models : \n", + "nlu.load('es.embed_sentence.bert.base_cased') returns Spark NLP model_anno_obj sent_bert_base_cased\n", + "nlu.load('es.embed_sentence.bert.base_uncased') returns Spark NLP model_anno_obj sent_bert_base_uncased\n", + "nlu.load('es.embed_sentence.mpnet.mpnet_embedding_negation_categories_classifier') returns Spark NLP model_anno_obj mpnet_embedding_negation_categories_classifier\n", + "nlu.load('es.embed_sentence.mpnet.mpnet_embedding_setfit_alpaca_es_unprocessable_sample_detection') returns Spark NLP model_anno_obj mpnet_embedding_setfit_alpaca_es_unprocessable_sample_detection\n", + "nlu.load('es.embed_sentence.mpnet.mpnet_embedding_setfit_alpaca_spanish_unprocessable_sample_detection') returns Spark NLP model_anno_obj mpnet_embedding_setfit_alpaca_spanish_unprocessable_sample_detection\n", + "nlu.load('es.embed_sentence.mpnet.negation_categories_classifier') returns Spark NLP model_anno_obj negation_categories_classifier\n", + "nlu.load('es.embed_sentence.mpnet.setfit_alpaca_spanish_unprocessable_sample_detection') returns Spark NLP model_anno_obj setfit_alpaca_spanish_unprocessable_sample_detection\n", + "For language NLU provides the following Models : \n", + "nlu.load('fi.embed_sentence.bert') returns Spark NLP model_anno_obj bert_base_finnish_uncased\n", + "nlu.load('fi.embed_sentence.bert.cased') returns Spark NLP model_anno_obj bert_base_finnish_cased\n", + "nlu.load('fi.embed_sentence.bert.uncased') returns Spark NLP model_anno_obj bert_base_finnish_uncased\n", + "For language NLU provides the following Models : \n", + "nlu.load('fr.embed_sentence.mpnet.biencoder_all_mpnet_base_v2_mmarcofr') returns Spark NLP model_anno_obj biencoder_all_mpnet_base_v2_mmarcofr\n", + "nlu.load('fr.embed_sentence.mpnet.biencoder_multi_qa_mpnet_base_cos_v1_mmarcofr') returns Spark NLP model_anno_obj biencoder_multi_qa_mpnet_base_cos_v1_mmarcofr\n", + "nlu.load('fr.embed_sentence.mpnet.mpnet_embedding_biencoder_all_mpnet_base_v2_mmarcofr') returns Spark NLP model_anno_obj mpnet_embedding_biencoder_all_mpnet_base_v2_mmarcoFR\n", + "nlu.load('fr.embed_sentence.mpnet.mpnet_embedding_biencoder_multi_qa_mpnet_base_cos_v1_mmarcofr') returns Spark NLP model_anno_obj mpnet_embedding_biencoder_multi_qa_mpnet_base_cos_v1_mmarcoFR\n", + "For language NLU provides the following Models : \n", + "nlu.load('ha.embed_sentence.xlm_roberta') returns Spark NLP model_anno_obj sent_xlm_roberta_base_finetuned_hausa\n", + "For language NLU provides the following Models : \n", + "nlu.load('ig.embed_sentence.xlm_roberta') returns Spark NLP model_anno_obj sent_xlm_roberta_base_finetuned_igbo\n", + "For language NLU provides the following Models : \n", + "nlu.load('lg.embed_sentence.xlm_roberta') returns Spark NLP model_anno_obj sent_xlm_roberta_base_finetuned_luganda\n", + "For language NLU provides the following Models : \n", + "nlu.load('nl.embed_sentence.bert.base_cased') returns Spark NLP model_anno_obj sent_bert_base_cased\n", + "For language NLU provides the following Models : \n", + "nlu.load('pcm.embed_sentence.xlm_roberta') returns Spark NLP model_anno_obj sent_xlm_roberta_base_finetuned_naija\n", + "For language NLU provides the following Models : \n", + "nlu.load('pt.embed_sentence.bert.base_legal') returns Spark NLP model_anno_obj 
sbert_legal_bertimbau_base_tsdae_sts\n", + "nlu.load('pt.embed_sentence.bert.cased_large_legal') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v0.1\n", + "nlu.load('pt.embed_sentence.bert.large_legal') returns Spark NLP model_anno_obj sbert_legal_bertimbau_large_gpl_sts\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_sts_v0.10.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v0.10\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_sts_v0.2.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v0.2\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_sts_v0.3.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v0.3\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_sts_v0.4.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v0.4\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_sts_v0.5.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v0.5\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_sts_v0.7.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v0.7\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_sts_v0.8.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v0.8\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_sts_v0.9.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v0.9\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_sts_v1.0.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_sts_v1.0\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_v0.11_gpl_nli_sts_v0.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_v0.11_gpl_nli_sts_v0\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_v0.11_gpl_nli_sts_v1.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_v0.11_gpl_nli_sts_v1\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_v0.11_nli_sts_v0.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_v0.11_nli_sts_v0\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_v0.11_nli_sts_v1.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_v0.11_nli_sts_v1\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_v0.11_sts_v0.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_v0.11_sts_v0\n", + "nlu.load('pt.embed_sentence.bert.legal.cased_large_mlm_v0.11_sts_v1.by_stjiris') returns Spark NLP model_anno_obj sbert_bert_large_portuguese_cased_legal_mlm_v0.11_sts_v1\n", + "nlu.load('pt.embed_sentence.bert.v2_base_legal') returns Spark NLP model_anno_obj sbert_legal_bertimbau_sts_base_ma_v2\n", + "nlu.load('pt.embed_sentence.bert.v2_large_legal') returns Spark NLP model_anno_obj sbert_legal_bertimbau_large_tsdae_sts_v2\n", + "nlu.load('pt.embed_sentence.bertimbau.legal.assin.base.by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_sts_base_ma\n", + "nlu.load('pt.embed_sentence.bertimbau.legal.assin2.base.by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_sts_base\n", + 
"nlu.load('pt.embed_sentence.bertimbau.legal.large_sts_by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_sts_large\n", + "nlu.load('pt.embed_sentence.bertimbau.legal.large_sts_ma.by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_sts_large_ma\n", + "nlu.load('pt.embed_sentence.bertimbau.legal.large_sts_ma_v3.by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_sts_large_ma_v3\n", + "nlu.load('pt.embed_sentence.bertimbau.legal.large_tsdae_sts.by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_large_tsdae_sts\n", + "nlu.load('pt.embed_sentence.bertimbau.legal.large_tsdae_sts_v4.by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_large_tsdae_sts_v4\n", + "nlu.load('pt.embed_sentence.bertimbau.legal.large_tsdae_v4_gpl_sts.by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_large_tsdae_v4_gpl_sts\n", + "nlu.load('pt.embed_sentence.bertimbau.legal.v2_large_sts_v2.by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_sts_large_v2\n", + "nlu.load('pt.embed_sentence.bertimbau.legal.v2_large_v2_sts.by_rufimelo') returns Spark NLP model_anno_obj sbert_legal_bertimbau_large_v2_sts\n", + "For language NLU provides the following Models : \n", + "nlu.load('rw.embed_sentence.xlm_roberta') returns Spark NLP model_anno_obj sent_xlm_roberta_base_finetuned_kinyarwanda\n", + "For language NLU provides the following Models : \n", + "nlu.load('sv.embed_sentence.bert.base_cased') returns Spark NLP model_anno_obj sent_bert_base_cased\n", + "For language NLU provides the following Models : \n", + "nlu.load('sw.embed_sentence.xlm_roberta') returns Spark NLP model_anno_obj sent_xlm_roberta_base_finetuned_swahili\n", + "For language NLU provides the following Models : \n", + "nlu.load('wo.embed_sentence.xlm_roberta') returns Spark NLP model_anno_obj sent_xlm_roberta_base_finetuned_wolof\n", + "For language NLU provides the following Models : \n", + "nlu.load('xx.embed_sentence') returns Spark NLP model_anno_obj sent_bert_multi_cased\n", + "nlu.load('xx.embed_sentence.bert') returns Spark NLP model_anno_obj sent_bert_multi_cased\n", + "nlu.load('xx.embed_sentence.bert.cased') returns Spark NLP model_anno_obj sent_bert_multi_cased\n", + "nlu.load('xx.embed_sentence.bert.muril') returns Spark NLP model_anno_obj sent_bert_muril\n", + "nlu.load('xx.embed_sentence.bert_use_cmlm_multi_base') returns Spark NLP model_anno_obj sent_bert_use_cmlm_multi_base\n", + "nlu.load('xx.embed_sentence.bert_use_cmlm_multi_base_br') returns Spark NLP model_anno_obj sent_bert_use_cmlm_multi_base_br\n", + "nlu.load('xx.embed_sentence.labse') returns Spark NLP model_anno_obj labse\n", + "nlu.load('xx.embed_sentence.xlm_roberta.base') returns Spark NLP model_anno_obj sent_xlm_roberta_base\n", + "For language NLU provides the following Models : \n", + "nlu.load('yo.embed_sentence.xlm_roberta') returns Spark NLP model_anno_obj sent_xlm_roberta_base_finetuned_yoruba\n", + "For language NLU provides the following Models : \n", + "nlu.load('zh.embed_sentence.bert') returns Spark NLP model_anno_obj sbert_chinese_qmc_finance_v1\n", + "nlu.load('zh.embed_sentence.bert.distilled') returns Spark NLP model_anno_obj sbert_chinese_qmc_finance_v1_distill\n" + ] + } + ], + "source": [ + "nlu.print_all_model_kinds_for_action('embed_sentence')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8qTXkzr9e9Mm" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + 
"kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/colab/component_examples/sentence_embeddings/NLU_OpenAI_embeddings.ipynb b/examples/colab/component_examples/sentence_embeddings/NLU_OpenAI_embeddings.ipynb new file mode 100644 index 00000000..5efab2fd --- /dev/null +++ b/examples/colab/component_examples/sentence_embeddings/NLU_OpenAI_embeddings.ipynb @@ -0,0 +1,586 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "id": "DC37C6LQvAEV" + }, + "source": [ + "# **OpenAIEmbeddings**\n", + "\n", + "\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/sentence_embeddings/NLU_OpenAI_embeddings.ipynb)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "inpeJBOC6CWP" + }, + "source": [ + "**OpenAIEmbeddings** combines powers of OpenAI’s embeddings model with the robust NLP processing capabilities of Spark NLP. This integration not only ensures the utilization of OpenAI's capabilities but also capitalizes on Spark's inherent scalability advantages.\n", + "\n", + " This annotator makes direct API calls to OpenAI’s Embeddings endpoint right from datasets. This enhancement promises to elevate the efficiency and versatility of data processing workflows within Spark NLP pipelines.\n", + "\n", + "\n", + "\n", + "\n", + "We use **[OpenAIEmbeddings](https://sparknlp.org/api/python/reference/autosummary/sparknlp/annotator/openai/openai_embeddings/index.html)** annotator powered by **Spark NLP 🚀**\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kMa52GSavAEb" + }, + "outputs": [], + "source": [ + "!pip install pyspark==3.4.1 nlu" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ouKhfneQfvAh" + }, + "source": [ + "**This feature requires OPEN_API_KEY env var to be present!**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YL6yWYrnvAEc", + "is_executing": true + }, + "outputs": [], + "source": [ + "print(\"Enter your OPENAI API Key:\")\n", + "OPENAI_API_KEY = input()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "MmpxiIdpgNX1" + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wGP9cNu8vAEe", + "outputId": "0a14c73b-547c-4a86-e913-4c8efec03c4e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ], + "source": [ + "import nlu\n", + "pipe=nlu.load(\"openai.embeddings\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vA_XdY_9Bx_9", + "outputId": "7b21b7ae-bd5e-490a-e1dd-1e1ec21a17a7" 
+ }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'openai_embeddings': OpenAIEmbeddings_ed1868a24d2c,\n", + " 'document_assembler': DocumentAssembler_9f1416981698}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZKw4knXOgWzx", + "outputId": "1d9b5116-1341-42d5-b353-8bc3b75e37a6" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "OpenAIEmbeddings_ed1868a24d2c" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set model ID of the OpenAI model to use\n", + "\n", + "pipe['openai_embeddings'].setModel('text-embedding-ada-002')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IokVlTt5gtst", + "outputId": "79cca406-53f8-4db9-f934-c8701952a62b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ], + "source": [ + "res=pipe.predict([\"The food was delicious and the waiter\", \"canine companions say\"], output_level='document')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 125 + }, + "id": "hFlxbpy0hM8h", + "outputId": "8bf44845-6132-468d-b636-c5cb45884ef1" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
documentsentence_embedding_openai
0The food was delicious and the waiter[0.000980676501058042, -0.004624095745384693, ...
1canine companions say[-0.00941392406821251, -0.013042356818914413, ...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " document \\\n", + "0 The food was delicious and the waiter \n", + "1 canine companions say \n", + "\n", + " sentence_embedding_openai \n", + "0 [0.000980676501058042, -0.004624095745384693, ... \n", + "1 [-0.00941392406821251, -0.013042356818914413, ... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "moSfKbzChOqw", + "outputId": "f3c23514-ce67-4db1-99d0-cc29b7b2d25c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "document: The food was delicious and the waiter\n", + "embeddings: [ 0.00098068 -0.0046241 0.00444637 ... -0.01971509 -0.01762044\n", + " 0.00992737]\n", + "\n", + "document: canine companions say\n", + "embeddings: [-0.00941392 -0.01304236 0.00682117 ... -0.00545336 -0.0044284\n", + " -0.03359871]\n", + "\n" + ] + } + ], + "source": [ + "for index, row in res.iterrows():\n", + " print(f\"document: {row['document']}\\nembeddings: {row['sentence_embedding_openai']}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kXmYcECKiN5q" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/colab/component_examples/sequence2sequence/OpenAI_completion.ipynb b/examples/colab/component_examples/sequence2sequence/OpenAI_completion.ipynb new file mode 100644 index 00000000..65b5c0a2 --- /dev/null +++ b/examples/colab/component_examples/sequence2sequence/OpenAI_completion.ipynb @@ -0,0 +1,610 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "id": "DC37C6LQvAEV" + }, + "source": [ + "# **OpenAICompletion**\n", + "\n", + "\n", + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/sequence2sequence/OpenAI_completion.ipynb)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "inpeJBOC6CWP" + }, + "source": [ + "**OpenAICompletion** combines powers of OpenAI’s completion models with the robust NLP processing capabilities of Spark NLP. This integration not only ensures the utilization of OpenAI's capabilities but also capitalizes on Spark's inherent scalability advantages.\n", + "\n", + "This annotator makes direct API calls to OpenAI’s Completion endpoint right from datasets. 
This enhancement promises to elevate the efficiency and versatility of data processing workflows within Spark NLP pipelines.\n", + "\n", + "\n", + "\n", + "\n", + "We use **[OpenAICompletion](https://sparknlp.org/docs/en/transformers#openaicompletion)** annotator powered by **Spark NLP 🚀**\n", + "\n", + "Reference: [OpenAI API Doc](https://platform.openai.com/docs/api-reference/completions/create)\n", + "\n", + "Reference: [OpenAICompletion Doc](https://sparknlp.org/api/python/reference/autosummary/sparknlp/annotator/openai/openai_completion/index.html#sparknlp.annotator.openai.openai_completion.OpenAICompletion)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kMa52GSavAEb" + }, + "outputs": [], + "source": [ + "!pip install pyspark==3.4.1 nlu" + ] + }, + { + "cell_type": "markdown", + "source": [ + "**This feature requires OPEN_API_KEY env var to be present!**" + ], + "metadata": { + "id": "ouKhfneQfvAh" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YL6yWYrnvAEc", + "is_executing": true + }, + "outputs": [], + "source": [ + "print(\"Enter your OPENAI API Key:\")\n", + "OPENAI_API_KEY = input()" + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY" + ], + "metadata": { + "id": "MmpxiIdpgNX1" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wGP9cNu8vAEe", + "outputId": "7dc15e79-8dbd-4c4d-c17d-99c6c7b1dd4d" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ], + "source": [ + "import nlu\n", + "pipe=nlu.load(\"openai.completion\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "vA_XdY_9Bx_9", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "ba80a8fc-e55d-45ea-d27f-cb30cfa8e571" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'openai_completion': OpenAICompletion_e01e6a2d8725,\n", + " 'document_assembler': DocumentAssembler_8b4b0e9fbaa3}" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "pipe" + ] + }, + { + "cell_type": "code", + "source": [ + "# Set model ID of the OpenAI model to use\n", + "\n", + "pipe['openai_completion'].setModel('text-davinci-003')\n", + "\n", + "# Set max tokens\n", + "pipe['openai_completion'].setMaxTokens(50)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZKw4knXOgWzx", + "outputId": "e1ac3239-a777-4836-c209-b272b6c1e32c" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "OpenAICompletion_e01e6a2d8725" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "res=pipe.predict([\"Generate a restaurant review.\", \"Write a review for a local eatery.\", \"Create a JSON with a review\"], output_level='document')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IokVlTt5gtst", + "outputId": "1763fda5-068a-486c-f7bc-40e8405f3686" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ] + }, + { + "cell_type": "code", + 
"source": [ + "res" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "hFlxbpy0hM8h", + "outputId": "e01f74af-51a4-4e16-8711-4bd26e2c3403" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " document \\\n", + "0 Generate a restaurant review. \n", + "1 Write a review for a local eatery. \n", + "2 Create a JSON with a review \n", + "\n", + " generated \n", + "0 \\n\\nI recently had the pleasure of dining at X... \n", + "1 \\n\\nI recently had dinner at Tastee Cafe and i... \n", + "2 \\n\\n{ \\n \"Product\": \"Toy Garage\",\\n \"Rat... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
documentgenerated
0Generate a restaurant review.\\n\\nI recently had the pleasure of dining at X...
1Write a review for a local eatery.\\n\\nI recently had dinner at Tastee Cafe and i...
2Create a JSON with a review\\n\\n{ \\n \"Product\": \"Toy Garage\",\\n \"Rat...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "for index, row in res.iterrows():\n", + " print(f\"document: {row['document']}\\ngenerated: {row['generated']}\\n\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "moSfKbzChOqw", + "outputId": "75532b59-847c-48e0-ba98-c40e87f7209b" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "document: Generate a restaurant review.\n", + "generated: \n", + "\n", + "I recently had the pleasure of dining at XYZ Restaurant and I was truly impressed. The food was delicious and the service was top-notch. The atmosphere was cozy and intimate, and the menu was varied and interesting. We ordered the\n", + "\n", + "document: Write a review for a local eatery.\n", + "generated: \n", + "\n", + "I recently had dinner at Tastee Cafe and it was definitely worth it! The restaurant has a casual atmosphere, and you can tell they work hard to provide the best service and quality food. The menu has a nice variety of classic\n", + "\n", + "document: Create a JSON with a review\n", + "generated: \n", + "\n", + "{ \n", + " \"Product\": \"Toy Garage\",\n", + " \"Rating\": \"5 stars\",\n", + " \"Review\": \"I bought this toy garage for my son as a birthday present and he loves it! All the pieces are good\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "kXmYcECKiN5q" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/colab/ocr/ocr_visual_document_classifier.ipynb b/examples/colab/ocr/ocr_visual_document_classifier.ipynb new file mode 100644 index 00000000..7bd1bfa2 --- /dev/null +++ b/examples/colab/ocr/ocr_visual_document_classifier.ipynb @@ -0,0 +1,343 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" + ], + "metadata": { + "id": "Wf4-YfQC2EdS" + } + }, + { + "cell_type": "markdown", + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/ocr/ocr_visual_document_classifier.ipynb)" + ], + "metadata": { + "id": "UvR9k1032Jcu" + } + }, + { + "cell_type": "markdown", + "source": [ + "# **VisualDocumentClassifier**\n", + "\n", + "\n", + "The **VisualDocumentClassifier** is a DL model for document classification using text and layout data. 
The currently available pre-trained model on the Tobacco3482 dataset contains 3482 images belonging to 10 different classes (Resume, News, Note, Advertisement, Scientific, Report, Form, Letter, Email and Memo)\n", + "\n", + "**All the available models:**\n", + "\n", + "| language | nlu.load() reference | Spark NLP Model Reference |\n", + "|----------|---------------------------|----------------------------------------|\n", + "| en | en.classify_image.tabacco | visual_document_classifier_tobacco3482 |" + ], + "metadata": { + "id": "JJ6YV5Qaz3Mc" + } + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "v5dxduk42r03" + } + }, + { + "cell_type": "markdown", + "source": [ + "## **Install NLU**" + ], + "metadata": { + "id": "iSYNEpL02oh_" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_KRf7uGFz1Fv" + }, + "outputs": [], + "source": [ + "!pip install johnsnowlabs\n", + "nlp.install(visual=True,force_browser=True)\n", + "nlp.start(visual=True)" + ] + }, + { + "cell_type": "code", + "source": [ + "from johnsnowlabs import nlp,visual" + ], + "metadata": { + "id": "Gvz0XQSc2-XO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "YnqeDgTn5BSn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## **Visual Document Classifier**" + ], + "metadata": { + "id": "9PqdTFpK5Pmq" + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Parsed Nlu_ref=en.classify_image.tabacco as lang=en\n", + "21:23:18, INFO Parsed Nlu_ref=en.classify_image.tabacco as lang=en\n", + "Parsed Nlu_ref=en.classify_image.tabacco as lang=en\n", + "21:23:18, INFO Parsed Nlu_ref=en.classify_image.tabacco as lang=en\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "Warning::Spark Session already created, some configs may not take.\n", + "visual_document_classifier_tobacco3482 download started this may take some time.\n", + "Approximate size to download 398.1 MB\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Adding visual_document_classifier to internal component_list\n", + "21:23:50, INFO Adding visual_document_classifier to internal component_list\n", + "Satisfying dependencies\n", + "21:23:50, INFO Satisfying dependencies\n", + "========================================================================\n", + "21:23:50, INFO ========================================================================\n", + "Resolution Status provided_features_no_ref = {'visual_classifier_confidence', 'visual_classifier_prediction'}\n", + "21:23:50, INFO Resolution Status provided_features_no_ref = {'visual_classifier_confidence', 'visual_classifier_prediction'}\n", + "Resolution Status required_features_no_ref = {'hocr'}\n", + "21:23:50, INFO Resolution Status required_features_no_ref = {'hocr'}\n", + "Resolution Status provided_features_ref = set()\n", + "21:23:50, INFO Resolution Status provided_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "21:23:50, INFO Resolution Status required_features_ref = set()\n", + "Resolution Status is_trainable = False\n", + "21:23:50, INFO Resolution Status is_trainable = False\n", + "Resolution Status conversion_candidates = []\n", + "21:23:50, INFO Resolution Status conversion_candidates = 
[]\n", + "Resolution Status missing_features_no_ref = {'hocr'}\n", + "21:23:50, INFO Resolution Status missing_features_no_ref = {'hocr'}\n", + "Resolution Status conversion_candidates = set()\n", + "21:23:50, INFO Resolution Status conversion_candidates = set()\n", + "========================================================================\n", + "21:23:50, INFO ========================================================================\n", + "Getting default for missing_feature_type=hocr\n", + "21:23:50, INFO Getting default for missing_feature_type=hocr\n", + "Resolved for missing components the following NLU components : []\n", + "21:23:50, INFO Resolved for missing components the following NLU components : []\n", + "adding image2hocr\n", + "21:23:50, INFO adding image2hocr\n", + "Adding image2hocr to internal component_list\n", + "21:23:50, INFO Adding image2hocr to internal component_list\n", + "========================================================================\n", + "21:23:50, INFO ========================================================================\n", + "Resolution Status provided_features_no_ref = {'hocr', 'visual_classifier_confidence', 'visual_classifier_prediction'}\n", + "21:23:50, INFO Resolution Status provided_features_no_ref = {'hocr', 'visual_classifier_confidence', 'visual_classifier_prediction'}\n", + "Resolution Status required_features_no_ref = {'hocr', 'ocr_image'}\n", + "21:23:50, INFO Resolution Status required_features_no_ref = {'hocr', 'ocr_image'}\n", + "Resolution Status provided_features_ref = set()\n", + "21:23:50, INFO Resolution Status provided_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "21:23:50, INFO Resolution Status required_features_ref = set()\n", + "Resolution Status is_trainable = False\n", + "21:23:50, INFO Resolution Status is_trainable = False\n", + "Resolution Status conversion_candidates = []\n", + "21:23:50, INFO Resolution Status conversion_candidates = []\n", + "Resolution Status missing_features_no_ref = {'ocr_image'}\n", + "21:23:50, INFO Resolution Status missing_features_no_ref = {'ocr_image'}\n", + "Resolution Status conversion_candidates = set()\n", + "21:23:50, INFO Resolution Status conversion_candidates = set()\n", + "========================================================================\n", + "21:23:50, INFO ========================================================================\n", + "Getting default for missing_feature_type=ocr_image\n", + "21:23:50, INFO Getting default for missing_feature_type=ocr_image\n", + "Resolved for missing components the following NLU components : []\n", + "21:23:50, INFO Resolved for missing components the following NLU components : []\n", + "adding binary2image\n", + "21:23:50, INFO adding binary2image\n", + "Adding binary2image to internal component_list\n", + "21:23:50, INFO Adding binary2image to internal component_list\n", + "========================================================================\n", + "21:23:50, INFO ========================================================================\n", + "Resolution Status provided_features_no_ref = {'hocr', 'visual_classifier_confidence', 'ocr_image', 'visual_classifier_prediction'}\n", + "21:23:50, INFO Resolution Status provided_features_no_ref = {'hocr', 'visual_classifier_confidence', 'ocr_image', 'visual_classifier_prediction'}\n", + "Resolution Status required_features_no_ref = {'hocr', 'ocr_image'}\n", + "21:23:50, INFO Resolution Status required_features_no_ref = {'hocr', 'ocr_image'}\n", + 
"Resolution Status provided_features_ref = set()\n", + "21:23:50, INFO Resolution Status provided_features_ref = set()\n", + "Resolution Status required_features_ref = set()\n", + "21:23:50, INFO Resolution Status required_features_ref = set()\n", + "Resolution Status is_trainable = False\n", + "21:23:50, INFO Resolution Status is_trainable = False\n", + "Resolution Status conversion_candidates = []\n", + "21:23:50, INFO Resolution Status conversion_candidates = []\n", + "Resolution Status missing_features_no_ref = set()\n", + "21:23:50, INFO Resolution Status missing_features_no_ref = set()\n", + "Resolution Status conversion_candidates = set()\n", + "21:23:50, INFO Resolution Status conversion_candidates = set()\n", + "========================================================================\n", + "21:23:50, INFO ========================================================================\n", + "!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!* ALL DEPENDENCIES SATISFIED !*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*\n", + "21:23:50, INFO !*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!* ALL DEPENDENCIES SATISFIED !*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*\n", + "Fixing column names\n", + "21:23:50, INFO Fixing column names\n", + "Fixing input and output column names\n", + "21:23:50, INFO Fixing input and output column names\n", + "Checking for component_to_resolve visual_document_classifier wether inputs {'hocr'} is satisfied by another component_to_resolve in the component_list \n", + "21:23:50, INFO Checking for component_to_resolve visual_document_classifier wether inputs {'hocr'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve image2hocr wether inputs {'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "21:23:50, INFO Checking for component_to_resolve image2hocr wether inputs {'ocr_image'} is satisfied by another component_to_resolve in the component_list \n", + "Checking for component_to_resolve binary2image wether inputs {'content', 'path'} is satisfied by another component_to_resolve in the component_list \n", + "21:23:50, INFO Checking for component_to_resolve binary2image wether inputs {'content', 'path'} is satisfied by another component_to_resolve in the component_list \n", + "Optimizing component_list component_to_resolve order\n", + "21:23:50, INFO Optimizing component_list component_to_resolve order\n", + "Starting to optimize component_to_resolve order \n", + "21:23:50, INFO Starting to optimize component_to_resolve order \n", + "Optimizing order for component_to_resolve visual_document_classifier\n", + "21:23:50, INFO Optimizing order for component_to_resolve visual_document_classifier\n", + "Optimizing order for component_to_resolve image2hocr\n", + "21:23:50, INFO Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve binary2image\n", + "21:23:50, INFO Optimizing order for component_to_resolve binary2image\n", + "Optimizing order for component_to_resolve visual_document_classifier\n", + "21:23:50, INFO Optimizing order for component_to_resolve visual_document_classifier\n", + "Optimizing order for component_to_resolve image2hocr\n", + "21:23:50, INFO Optimizing order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve visual_document_classifier\n", + "21:23:50, INFO Optimizing order for component_to_resolve visual_document_classifier\n", + "Optimizing order for component_to_resolve image2hocr\n", + "21:23:50, INFO Optimizing 
order for component_to_resolve image2hocr\n", + "Optimizing order for component_to_resolve visual_document_classifier\n", + "21:23:50, INFO Optimizing order for component_to_resolve visual_document_classifier\n", + "Optimizing order for component_to_resolve visual_document_classifier\n", + "21:23:50, INFO Optimizing order for component_to_resolve visual_document_classifier\n", + "Renaming duplicates cols\n", + "21:23:50, INFO Renaming duplicates cols\n", + "Done with component_list optimizing\n", + "21:23:50, INFO Done with component_list optimizing\n", + "Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.\n", + "21:23:50, INFO Fitting on empty Dataframe, could not infer correct training method. This is intended for non-trainable pipelines.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Configuring Light Pipeline Usage\n", + "21:23:52, INFO Configuring Light Pipeline Usage\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Inferred and set output level of pipeline to document\n", + "21:23:53, INFO Inferred and set output level of pipeline to document\n", + "Extracting for same_level_cols = ['text']\n", + "\n", + "21:24:07, INFO Extracting for same_level_cols = ['text']\n", + "\n" + ] + } + ], + "source": [ + "p = nlu.load('en.classify_image.tabacco',verbose=True)\n", + "res = p.predict('cv_test.png')" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-02T15:54:07.633782200Z", + "start_time": "2024-01-02T15:53:18.012906100Z" + }, + "id": "IS6wleBN4ynd", + "outputId": "6e8ebc43-92c3-4821-a495-6c515962152e" + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [ + { + "data": { + "text/plain": " file_path \\\n0 file:/F:/Work/repos/nlu/tests/nlu_ocr_tests/cv... \n\n visual_classifier_confidence visual_classifier_prediction \n0 0.990776 Resume ", + "text/html": "
[stripped HTML table markup removed; same data as the text/plain output above]
" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-02T15:54:28.984492300Z", + "start_time": "2024-01-02T15:54:28.925420500Z" + }, + "id": "FxOSw_JS4ynd", + "outputId": "392b617e-cb07-42e5-814f-81b3e8c8a0d6" + } + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "x-9XHEyd2qvy" + } + } + ] +} \ No newline at end of file diff --git a/nlu/__init__.py b/nlu/__init__.py index 1e924ff1..27876f47 100644 --- a/nlu/__init__.py +++ b/nlu/__init__.py @@ -1,4 +1,4 @@ -__version__ = '5.1.4' +__version__ = '5.1.5rc19' import nlu.utils.environment.env_utils as env_utils @@ -178,7 +178,7 @@ def to_nlu_pipe(nlp_pipe: Union[Pipeline, LightPipeline, PipelineModel, List], i def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool = False, gpu: bool = False, streamlit_caching: bool = False, - m1_chip: bool = False + apple_silicon: bool = False ) -> NLUPipeline: ''' Load either a prebuild pipeline or a set of components identified by a whitespace seperated list of components @@ -197,7 +197,10 @@ def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool = return nlu.load(request, path, verbose, gpu, streamlit_caching) # check if secrets are in default loc, if yes load them and create licensed context automatically auth(gpu=gpu) - spark = get_open_source_spark_context(gpu, m1_chip) + if request.startswith("openai"): + spark = get_open_source_spark_context_with_openai(gpu,apple_silicon) + else: + spark = get_open_source_spark_context(gpu, apple_silicon) # spark.catalog.clearCache() if verbose: @@ -362,16 +365,31 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline: raise ValueError -def get_open_source_spark_context(gpu, m1_chip): +def get_open_source_spark_context(gpu, apple_silicon): if env_utils.is_env_pyspark_3_x(): - if m1_chip: - return sparknlp.start(gpu=gpu, m1=True) + if apple_silicon: + return sparknlp.start(gpu=gpu, apple_silicon=True) else: return sparknlp.start(gpu=gpu) raise ValueError(f"Failure starting Spark Context! Current Spark version {get_pyspark_version()} not supported! " f"Please install any of Pyspark 3.X versions.") +def get_open_source_spark_context_with_openai(gpu, apple_silicon): + + if env_utils.is_env_pyspark_3_x(): + OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') + + if OPENAI_API_KEY: + openai_params = {"spark.jsl.settings.openai.api.key": OPENAI_API_KEY} + if apple_silicon: + return sparknlp.start(gpu=gpu, apple_silicon=True, params=openai_params) + else: + return sparknlp.start(gpu=gpu, params=openai_params) + else: + raise Exception("This feature requires OPEN_API_KEY env var to be present!") + raise ValueError(f"Failure starting Spark Context! Current Spark version {get_pyspark_version()} not supported! 
" + f"Please install any of Pyspark 3.X versions.") def enable_verbose() -> None: logger.setLevel(logging.INFO) ch = logging.StreamHandler() diff --git a/nlu/ocr_components/visual_classifiers/visual_doc_classifier/__init__.py b/nlu/components/classifiers/bart_zero_shot_classification/__init__.py similarity index 100% rename from nlu/ocr_components/visual_classifiers/visual_doc_classifier/__init__.py rename to nlu/components/classifiers/bart_zero_shot_classification/__init__.py diff --git a/nlu/components/classifiers/bart_zero_shot_classification/bart_zero_shot.py b/nlu/components/classifiers/bart_zero_shot_classification/bart_zero_shot.py new file mode 100644 index 00000000..2481ea05 --- /dev/null +++ b/nlu/components/classifiers/bart_zero_shot_classification/bart_zero_shot.py @@ -0,0 +1,17 @@ +from sparknlp.annotator import * + + +class BartZeroShotClassifier: + @staticmethod + def get_default_model(): + return BartForZeroShotClassification.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) + + @staticmethod + def get_pretrained_model(name, language, bucket=None): + return BartForZeroShotClassification.pretrained(name, language, bucket) \ + .setInputCols(["token", "document"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) diff --git a/nlu/components/classifiers/deberta_zero_shot/__init__.py b/nlu/components/classifiers/deberta_zero_shot/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/components/classifiers/deberta_zero_shot/deberta_zero_shot.py b/nlu/components/classifiers/deberta_zero_shot/deberta_zero_shot.py new file mode 100644 index 00000000..40778e97 --- /dev/null +++ b/nlu/components/classifiers/deberta_zero_shot/deberta_zero_shot.py @@ -0,0 +1,17 @@ +from sparknlp.annotator import DeBertaForZeroShotClassification + + +class DeBertaZeroShotClassifier: + @staticmethod + def get_default_model(): + return DeBertaForZeroShotClassification.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) + + @staticmethod + def get_pretrained_model(name, language, bucket=None): + return DeBertaForZeroShotClassification.pretrained(name, language, bucket) \ + .setInputCols(["token", "document"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) \ No newline at end of file diff --git a/nlu/components/classifiers/xlm_roberta_zero_shot_classification/__init__.py b/nlu/components/classifiers/xlm_roberta_zero_shot_classification/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/components/classifiers/xlm_roberta_zero_shot_classification/xlm_roberta_zero_shot.py b/nlu/components/classifiers/xlm_roberta_zero_shot_classification/xlm_roberta_zero_shot.py new file mode 100644 index 00000000..29dcfbff --- /dev/null +++ b/nlu/components/classifiers/xlm_roberta_zero_shot_classification/xlm_roberta_zero_shot.py @@ -0,0 +1,17 @@ +from sparknlp.annotator import * + + +class XlmRobertaZeroShotClassifier: + @staticmethod + def get_default_model(): + return XlmRoBertaForZeroShotClassification.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) + + @staticmethod + def get_pretrained_model(name, language, bucket=None): + return XlmRoBertaForZeroShotClassification.pretrained(name, language, bucket) \ + .setInputCols(["token", "document"]) \ + .setOutputCol("category") \ + .setCaseSensitive(True) diff --git a/nlu/components/embeddings/openai_embeddings/__init__.py 
b/nlu/components/embeddings/openai_embeddings/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/components/embeddings/openai_embeddings/openai_embeddings.py b/nlu/components/embeddings/openai_embeddings/openai_embeddings.py new file mode 100644 index 00000000..73e9b8b8 --- /dev/null +++ b/nlu/components/embeddings/openai_embeddings/openai_embeddings.py @@ -0,0 +1,19 @@ +from sparknlp.annotator import * + +class OpenaiEmbeddings: + @staticmethod + def get_default_model(): + return OpenAIEmbeddings() \ + .setInputCols("document") \ + .setOutputCol("embeddings") + + + @staticmethod + def get_pretrained_model(name, language, bucket=None): + return OpenAIEmbeddings() \ + .setInputCols("document") \ + .setOutputCol("embeddings") + + + + diff --git a/nlu/components/embeddings/sentence_bge/BGESentenceEmbedding.py b/nlu/components/embeddings/sentence_bge/BGESentenceEmbedding.py new file mode 100644 index 00000000..74e8005b --- /dev/null +++ b/nlu/components/embeddings/sentence_bge/BGESentenceEmbedding.py @@ -0,0 +1,16 @@ +import sparknlp +from sparknlp.annotator import BGEEmbeddings + + +class BGE: + @staticmethod + def get_default_model(): + return BGEEmbeddings.pretrained() \ + .setInputCols(["document"]) \ + .setOutputCol("bge_embeddings") + + @staticmethod + def get_pretrained_model(name, language, bucket=None): + return BGEEmbeddings.pretrained(name,language,bucket) \ + .setInputCols(["document"]) \ + .setOutputCol("bge_embeddings") diff --git a/nlu/components/embeddings/sentence_bge/__init__.py b/nlu/components/embeddings/sentence_bge/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/components/seq2seqs/openai_completion/__init__.py b/nlu/components/seq2seqs/openai_completion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/components/seq2seqs/openai_completion/openai_completion.py b/nlu/components/seq2seqs/openai_completion/openai_completion.py new file mode 100644 index 00000000..3eb63017 --- /dev/null +++ b/nlu/components/seq2seqs/openai_completion/openai_completion.py @@ -0,0 +1,19 @@ +from sparknlp.annotator import * + +class OpenaiCompletion: + @staticmethod + def get_default_model(): + return OpenAICompletion() \ + .setInputCols("document") \ + .setOutputCol("completion") + + + @staticmethod + def get_pretrained_model(name, language, bucket=None): + return OpenAICompletion() \ + .setInputCols("document") \ + .setOutputCol("completion") + + + + diff --git a/nlu/ocr_components/visual_classifiers/visual_document_classifier/__init__.py b/nlu/ocr_components/visual_classifiers/visual_document_classifier/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nlu/ocr_components/visual_classifiers/visual_document_classifier/visual_document_classifier.py b/nlu/ocr_components/visual_classifiers/visual_document_classifier/visual_document_classifier.py new file mode 100644 index 00000000..183d0042 --- /dev/null +++ b/nlu/ocr_components/visual_classifiers/visual_document_classifier/visual_document_classifier.py @@ -0,0 +1,9 @@ +class VisualDocClassifier: + @staticmethod + def get_default_model(): + from sparkocr.transformers import VisualDocumentClassifier + return VisualDocumentClassifier.pretrained("visual_document_classifier_tobacco3482", "en", "clinical/ocr") \ + .setMaxSentenceLength(128) \ + .setInputCol("hocr") \ + .setLabelCol("prediction") \ + .setConfidenceCol("conf") diff --git a/nlu/pipe/col_substitution/col_name_substitution_utils.py b/nlu/pipe/col_substitution/col_name_substitution_utils.py 
index 77c05cdb..a5f73275 100644 --- a/nlu/pipe/col_substitution/col_name_substitution_utils.py +++ b/nlu/pipe/col_substitution/col_name_substitution_utils.py @@ -14,6 +14,11 @@ import nlu from nlu.pipe.col_substitution import substitution_map_OS +from nlu.universe.feature_universes import NLP_FEATURES +from nlu.pipe.col_substitution import substitution_map_OS +from nlu.pipe.col_substitution import col_substitution_OS +import logging + from nlu.pipe.extractors.extractor_base_data_classes import SparkOCRExtractorConfig from nlu.universe.feature_universes import NLP_FEATURES from nlu.universe.logic_universes import AnnoTypes @@ -68,12 +73,19 @@ def substitute_col_names(df, anno_2_ex, pipe, stranger_cols=[], get_embeddings=F deducted_component_names = ColSubstitutionUtils.deduct_component_names(pipe) for c in pipe.components: if c.license == Licenses.ocr: + from nlu.pipe.col_substitution import substitution_map_OCR # TODO better substitution old2new_anno_cols = {k: k for k in c.spark_output_column_names} anno2final_cols[c.model] = list(old2new_anno_cols.values()) new_cols.update(old2new_anno_cols) new_cols = {**new_cols, **(old2new_anno_cols)} - continue + if type(c.model) in substitution_map_OCR.OCR_anno2substitution_fn.keys(): + cols = df.columns.tolist() + substitution_fn = substitution_map_OCR.OCR_anno2substitution_fn[type(c.model)]['default'] + old2new_anno_cols = substitution_fn(c, cols, deducted_component_names[c]) + anno2final_cols[c.model] = list(old2new_anno_cols.values()) + new_cols = {**new_cols, **(old2new_anno_cols)} + continue if 'embedding' in c.type and get_embeddings == False: continue cols_to_substitute = ColSubstitutionUtils.get_final_output_cols_of_component(c, df, anno_2_ex) if len(cols_to_substitute) == 0: @@ -93,6 +105,7 @@ def substitute_col_names(df, anno_2_ex, pipe, stranger_cols=[], get_embeddings=F anno2final_cols[c.model] = list(old2new_anno_cols.values()) new_cols.update(old2new_anno_cols) continue + # dic, key=old_col, value=new_col. Some cols may be omitted and missing from the dic which are deemed irrelevant. 
Behaivour can be disabled by setting drop_debug_cols=False old2new_anno_cols = substitution_fn(c, cols_to_substitute, deducted_component_names[c]) anno2final_cols[c.model] = list(old2new_anno_cols.values()) diff --git a/nlu/pipe/col_substitution/col_substitution_OCR.py b/nlu/pipe/col_substitution/col_substitution_OCR.py index a9240365..dc390e93 100644 --- a/nlu/pipe/col_substitution/col_substitution_OCR.py +++ b/nlu/pipe/col_substitution/col_substitution_OCR.py @@ -17,6 +17,28 @@ def substitute_recognized_text_cols(c, cols, is_unique=True, nlu_identifier=''): for c in cols: new_cols[c] = c return new_cols # TODO + +def substitute_document_classifier_text_cols(c, cols, is_unique=True, nlu_identifier=''): + """ + Drug Norm is always unique + Fetched fields are: + - entities@_results + - entities@_ + - entities@_entity + - entities@_confidence + """ + new_cols = {} + for c in cols: + if 'visual_classifier_label.1' in cols: + new_cols['visual_classifier_label.1'] = 'file_path' + if 'visual_classifier_label' in cols: + new_cols['visual_classifier_label'] = 'visual_classifier_prediction' + + new_cols[c] = c + return new_cols # TODO + + + # new_base_name = 'generic_classifier' if is_unique else f'generic_classification_{nlu_identifier}' # for col in cols : # if '_results' in col : new_cols[col] = new_base_name diff --git a/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py b/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py index 882e34f7..fb2da7fa 100644 --- a/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py +++ b/nlu/pipe/col_substitution/name_deduction/name_deductable_annotators_OS.py @@ -20,6 +20,8 @@ WordEmbeddingsModel , ElmoEmbeddings , E5Embeddings, + BGEEmbeddings, + OpenAIEmbeddings, BertSentenceEmbeddings, RoBertaSentenceEmbeddings, UniversalSentenceEncoder, @@ -88,6 +90,8 @@ WordEmbeddingsModel , ElmoEmbeddings , E5Embeddings, + BGEEmbeddings, + OpenAIEmbeddings, BertSentenceEmbeddings, RoBertaSentenceEmbeddings, InstructorEmbeddings, diff --git a/nlu/pipe/col_substitution/substitution_map_OCR.py b/nlu/pipe/col_substitution/substitution_map_OCR.py new file mode 100644 index 00000000..8a8922dc --- /dev/null +++ b/nlu/pipe/col_substitution/substitution_map_OCR.py @@ -0,0 +1,30 @@ +""" +Resolve Annotator Classes in the Pipeline to Extractor Configs and Methods + +Every Annotator should have 2 configs. Some might offor multuple configs/method pairs, based on model_anno_obj/NLP reference. +- default/minimalistic -> Just the results of the annotations, no confidences or extra metadata +- with meta -> A config that leverages white/black list and gets the most relevant metadata +- with positions -> With Begins/Ends +- with sentence references -> Reeturn the sentence/chunk no. reference from the metadata. 
+ If a document has multi-sentences, this will map a label back to a corrosponding sentence + +""" +# from nlu.pipe.col_substitution.col_substitution_HC import * +from nlu.pipe.col_substitution.col_substitution_OS import * +from nlu.pipe.col_substitution.col_substitution_OCR import * + +from sparkocr.transformers import * + +OCR_anno2substitution_fn = { + VisualDocumentClassifier : { + 'default': substitute_document_classifier_text_cols , + }, + +} + + + + + + + diff --git a/nlu/pipe/col_substitution/substitution_map_OS.py b/nlu/pipe/col_substitution/substitution_map_OS.py index 326ec49b..46e854cd 100644 --- a/nlu/pipe/col_substitution/substitution_map_OS.py +++ b/nlu/pipe/col_substitution/substitution_map_OS.py @@ -48,7 +48,13 @@ 'default': substitute_word_embed_cols, }, E5Embeddings: { - 'default': substitute_word_embed_cols, + 'default': substitute_sent_embed_cols, + }, + OpenAIEmbeddings: { + 'default': substitute_sent_embed_cols, + }, + BGEEmbeddings: { + 'default': substitute_sent_embed_cols, }, BertSentenceEmbeddings: { 'default': substitute_sent_embed_cols, diff --git a/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py b/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py index d6a4659f..0115c831 100644 --- a/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py +++ b/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py @@ -362,6 +362,7 @@ def zip_and_explode(df: pd.DataFrame, cols_to_explode: List[str]) -> pd.DataFram Elements of columns which are not in cols_to_explode, will be in lists """ # Check cols we want to explode actually exist, if no data extracted cols can be missing + # print(df) missing = [] for col in cols_to_explode: if col not in df.columns: @@ -369,7 +370,18 @@ def zip_and_explode(df: pd.DataFrame, cols_to_explode: List[str]) -> pd.DataFram for miss in missing: cols_to_explode.remove(miss) # Drop duplicate cols - df = df.loc[:, ~df.columns.duplicated()] + # df = df.loc[:, ~df.columns.duplicated()] + if df.columns.duplicated().any(): + # If there are duplicate column names, append a suffix to make them unique + cols = pd.Series(df.columns) + for dup in cols[cols.duplicated()].unique(): + cols[cols[cols == dup].index.values.tolist()] = [dup + '.' + str(i) if i != 0 else dup for i in + range(sum(cols == dup))] + df.columns = cols + else: + # If there are no duplicate column names, remove duplicate columns + df = df.loc[:, ~df.columns.duplicated()] + if len(cols_to_explode) > 0: # We must pad all cols we want to explode to the same length because pandas limitation. # Spark API does not require this since it handles cols with not same length by creating nan. 
We do it ourselves here manually diff --git a/nlu/pipe/extractors/extractor_methods/ocr_extractors.py b/nlu/pipe/extractors/extractor_methods/ocr_extractors.py index e284c1b8..fb9b6a73 100644 --- a/nlu/pipe/extractors/extractor_methods/ocr_extractors.py +++ b/nlu/pipe/extractors/extractor_methods/ocr_extractors.py @@ -57,4 +57,5 @@ def use_first_row_as_column_names_for_list_of_dfs(pd_tables): new_tables = [] for t in pd_tables: new_tables.append(use_first_row_as_column_names(t)) + # print(new_tables) return new_tables diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py index 966d03f7..b00f95a8 100644 --- a/nlu/pipe/pipeline.py +++ b/nlu/pipe/pipeline.py @@ -281,6 +281,12 @@ def unpack_and_apply_extractors(self, pdf: Union[pyspark.sql.DataFrame, pd.DataF Can process Spark DF output from Vanilla pipes and Pandas Converts outputs of Lightpipeline """ + if isinstance(pdf,pyspark.sql.dataframe.DataFrame): + if 'modificationTime' in pdf.columns: + # drop because of + # 'TypeError: Casting to unit-less dtype 'datetime64' is not supported. + # Pass e.g. 'datetime64[ns]' instead. processed' + pdf = pdf.drop('modificationTime') # Light pipe, does not fetch emebddings if light_pipe_enabled and not get_embeddings and not isinstance(pdf, pyspark.sql.dataframe.DataFrame) or self.prefer_light: @@ -490,7 +496,27 @@ def predict(self, from nlu.pipe.utils.predict_helper import __predict__ return __predict__(self, data, output_level, positions, keep_stranger_features, metadata, multithread, drop_irrelevant_cols, return_spark_df, get_embeddings) + def predict_embeds(self, + data, + multithread=True, + return_spark_df=False, + ): + ''' + Annotates a Pandas Dataframe/Pandas Series/Numpy Array/Spark DataFrame/Python List strings /Python String abd returns List of Floats or Spark-Df, only with embeddings. + :param data: Data to predict on + and drop_irrelevant_cols = True then chunk, sentence and Doc will be dropped + :param return_spark_df: Prediction results will be returned right after transforming with the Spark NLP pipeline + This will run fully distributed in on the Spark Master, but not prettify the output dataframe + :param return_spark_df: return Spark-DF and not collect all data into driver instead of returning list of float + :param multithread: Use multithreaded Light-pipeline instead of spark-pipeline + :return: + ''' + from nlu.pipe.utils.predict_helper import __predict__ + return __predict__(self, data, output_level=None, positions=False, keep_stranger_features=False, metadata=False, + multithread=multithread, + drop_irrelevant_cols=True, return_spark_df=return_spark_df, get_embeddings=True, + embed_only=True) def print_info(self, minimal=True): ''' Print out information about every component_to_resolve currently loaded in the component_list and their configurable parameters. 
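A minimal usage sketch for the new predict_embeds() entry point added to pipeline.py above (the spell and input strings are illustrative; 'en.embed_sentence.bge_small' is one of the references this patch registers, and any sentence-embedding spell should behave the same):

import nlu

pipe = nlu.load('en.embed_sentence.bge_small')
# predict_embeds() skips the pretty-output dataframe path entirely and returns
# a plain Python list with one embedding vector (list of floats) per input document
vectors = pipe.predict_embeds(['first document', 'second document'])
print(len(vectors), len(vectors[0]))  # number of documents, embedding dimension
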
diff --git a/nlu/pipe/utils/audio_data_conversion_utils.py b/nlu/pipe/utils/audio_data_conversion_utils.py index 4369d2d7..afdf0777 100644 --- a/nlu/pipe/utils/audio_data_conversion_utils.py +++ b/nlu/pipe/utils/audio_data_conversion_utils.py @@ -1,11 +1,13 @@ -from pyspark.sql.types import * import glob import logging import os from typing import List -import pyspark + import numpy as np import pandas as pd +import pyspark +from johnsnowlabs.utils.env_utils import is_running_in_databricks +from pyspark.sql.types import * logger = logging.getLogger('nlu') @@ -32,6 +34,8 @@ def validate_paths(data): @staticmethod def check_iterable_paths_are_valid(iterable_paths): """Validate for iterable data input if all elements point to file or jsl_folder""" + if is_running_in_databricks(): + iterable_paths = [f'/dbfs{p}' for p in iterable_paths] paths_validness = [] for p in iterable_paths: if os.path.isdir(p) or os.path.isfile(p): @@ -82,14 +86,18 @@ def glob_files_of_accepted_type(paths, file_types): 1. paths point to a file which is suffixed with one of the accepted file_types, i.e. path/to/file.type 2. path points to a jsl_folder, in this case jsl_folder is recursively searched for valid files and accepted paths will be in return result """ + if is_running_in_databricks(): + paths = [f'/dbfs{p}' for p in paths] accepted_file_paths = [] for p in paths: for t in file_types: t = t.lower() - if os.path.isfile(p): + if os.path.isfile(p) or is_running_in_databricks() and os.path.isfile(f'/dbfs{p}'): if p.lower().split('.')[-1] == t: + if is_running_in_databricks(): + p = p.replace('/dbfs', '', 1) accepted_file_paths.append(p) - elif os.path.isdir(p): + elif os.path.isdir(p) or is_running_in_databricks() and os.path.isdir(f'/dbfs{p}'): accepted_file_paths += glob.glob(p + f'/**/*.{t}', recursive=True) else: print(f"Invalid path = {p} pointing neither to file or jsl_folder on this machine") diff --git a/nlu/pipe/utils/ocr_data_conversion_utils.py b/nlu/pipe/utils/ocr_data_conversion_utils.py index 504de3fd..44b4fd2a 100644 --- a/nlu/pipe/utils/ocr_data_conversion_utils.py +++ b/nlu/pipe/utils/ocr_data_conversion_utils.py @@ -3,9 +3,11 @@ import logging import os from typing import List -import pyspark + import numpy as np import pandas as pd +import pyspark +from johnsnowlabs.utils.env_utils import is_running_in_databricks logger = logging.getLogger('nlu') @@ -30,6 +32,8 @@ def validate_OCR_compatible_inputs(data): @staticmethod def check_iterable_paths_are_valid(iterable_paths): """Validate for iterable data input if all elements point to file or jsl_folder""" + if is_running_in_databricks(): + iterable_paths = [f'/dbfs{p}' for p in iterable_paths] paths_validness = [] for p in iterable_paths: if os.path.isdir(p) or os.path.isfile(p): @@ -53,15 +57,21 @@ def glob_files_of_accepted_type(paths, file_types): 1. paths point to a file which is suffixed with one of the accepted file_types, i.e. path/to/file.type 2. 
path points to a jsl_folder, in this case jsl_folder is recurisvely searched for valid files and accepted paths will be in return result """ + + if is_running_in_databricks(): + paths = [f'/dbfs{p}' for p in paths] accepted_file_paths = [] for p in paths: for t in file_types: t = t.lower() - if os.path.isfile(p): + if os.path.isfile(p) or is_running_in_databricks() and os.path.isfile(f'/dbfs{p}'): if p.lower().split('.')[-1] == t: + if is_running_in_databricks(): + p = p.replace('/dbfs', '', 1) accepted_file_paths.append(p) - elif os.path.isdir(p): - accepted_file_paths += glob.glob(p + f'/*.{t.upper()}', recursive=True) + glob.glob(p + f'/*.{t}', recursive=True) + elif os.path.isdir(p) or is_running_in_databricks() and os.path.isdir(f'/dbfs{p}'): + accepted_file_paths += glob.glob(p + f'/*.{t.upper()}', recursive=True) + glob.glob(p + f'/*.{t}', + recursive=True) else: print(f"Invalid path = {p} pointing neither to file or jsl_folder on this machine") return accepted_file_paths diff --git a/nlu/pipe/utils/predict_helper.py b/nlu/pipe/utils/predict_helper.py index 9a2cb4de..ef8db20c 100644 --- a/nlu/pipe/utils/predict_helper.py +++ b/nlu/pipe/utils/predict_helper.py @@ -217,18 +217,16 @@ def __db_endpoint_predict__(pipe, data): else: params = {} files = [] - if 'file' in data.columns and 'file_type' in data.columns: - print("DETECTED FILE COLS") - skip_first = PredictParams.has_param_cols(data) + if 'binary_file' in data.columns and 'file_name' in data.columns: + # skip_first = PredictParams.has_param_cols(data) for i, row in data.iterrows(): - print(f"DESERIALIZING {row.file_type} file {row.file}") - if i == 0 and skip_first: - continue - file_name = f'file{i}.{row.file_type}' - files.append(file_name) - deserialize(row.file, file_name) + # print(f"DESERIALIZING {row.file_type} file {row.file}") + # if i == 0 and skip_first: + # continue + files.append(row.file_name) + deserialize(row.binary_file, row.file_name) + # data is now list of file path data = files - if params: return __predict__(pipe, data, **params, normal_pred_on_db=True) else: @@ -259,6 +257,15 @@ def __predict_standard_spark_only_embed(pipe, data, return_spark_df): # Note, this is only document output level. If pipe has sentence detector, we will only keep first embed of every document. 
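# (collect() pulls the embedding annotations onto the driver; r.embeddings holds one vector per annotation row, so [0] selects the document-level vector)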
return [r.embeddings[0] for r in data.select(f'{emb_col}.embeddings').collect()] +def try_update_session(): + try: + import sparknlp + spark = sparknlp.start() + spark._jvm.com.johnsnowlabs.license.LicenseValidator.meterServingUsage( + spark._jvm.scala.Option.apply(None) + ) + except Exception as e: + print(f"Error updating session: {e}") def __predict__(pipe, data, output_level, positions, keep_stranger_features, metadata, multithread, drop_irrelevant_cols, return_spark_df, get_embeddings, embed_only=False,normal_pred_on_db=False): @@ -274,12 +281,21 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met :param return_spark_df: Prediction results will be returned right after transforming with the Spark NLP pipeline :return: ''' + if embed_only: pipe.fit() return __predict_standard_spark_only_embed(pipe, data, return_spark_df) if 'DB_ENDPOINT_ENV' in os.environ and not normal_pred_on_db: - return __db_endpoint_predict__(pipe,data) + + try_update_session() + df = __db_endpoint_predict__(pipe,data) + if isinstance(df, pd.DataFrame): + if 'output_level' in df.columns: + df = df.drop(columns=['output_level']) + if PipeUtils.has_table_extractor(pipe): + return {'tables': df} + return df if output_level == '' and not pipe.has_table_qa_models: # Default sentence level for all components @@ -294,8 +310,7 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met pipe.components = PipeUtils.configure_component_output_levels(pipe, output_level) elif pipe.has_nlp_components and output_level in ['token']: - # Add tokenizer if not in pipe, default its inputs to sentence - pipe.component_output_level = 'sentence' + # Add tokenizer if not iadel = 'sentence' pipe.components = PipeUtils.configure_component_output_levels(pipe, 'sentence') pipe = PipeUtils.add_tokenizer_to_pipe_if_missing(pipe) diff --git a/nlu/spellbook.py b/nlu/spellbook.py index 58d386e8..8637bd78 100644 --- a/nlu/spellbook.py +++ b/nlu/spellbook.py @@ -197,6 +197,8 @@ class Spellbook: 'grammar_correctness': ('t5_base', 'model_anno_obj', {'setTask': '"cola sentence: "'}), 'answer_question': ('t5_base', 'model_anno_obj', {'setTask': '"question: "'}), 'classify.sentiment_t5': ('t5_base', 'model_anno_obj', {'setTask': '"sst2 sentence: "'}), + 'openai.completion': ('openai.completion', 'model_anno_obj'), + 'openai.embeddings': ('openai.embeddings', 'model_anno_obj'), } # multi lang models @@ -2437,8 +2439,10 @@ class Spellbook: 'eml': {'eml.embed.w2v_cc_300d': 'w2v_cc_300d'}, 'en': { 'en.distilbert.zero_shot_classifier': 'distilbert_base_zero_shot_classifier_uncased_mnli', + 'en.deberta.zero_shot_classifier': 'deberta_base_zero_shot_classifier_mnli_anli_v3', 'en.classify_image.convnext.tiny': 'image_classifier_convnext_tiny_224_local', 'en.bert.zero_shot_classifier': 'bert_base_cased_zero_shot_classifier_xnli', + 'en.bart.zero_shot_classifier': 'bart_large_zero_shot_classifier_mnli', 'en.roberta.zero_shot_classifier': 'roberta_base_zero_shot_classifier_nli', 'en.seq2seq.distilbart_xsum_12_6': 'distilbart_xsum_12_6', 'en.seq2seq.bart_large_cnn': 'bart_large_cnn', @@ -4775,6 +4779,9 @@ class Spellbook: 'en.embed_sentence.biobert.pubmed_base_cased': 'sent_biobert_pubmed_base_cased', 'en.embed_sentence.biobert.pubmed_large_cased': 'sent_biobert_pubmed_large_cased', 'en.embed_sentence.biobert.pubmed_pmc_base_cased': 'sent_biobert_pubmed_pmc_base_cased', + 'en.embed_sentence.bge_base': 'bge_base', + 'en.embed_sentence.bge_small': 'bge_small', + 'en.embed_sentence.bge_large': 'bge_large', 
diff --git a/nlu/spellbook.py b/nlu/spellbook.py index 58d386e8..8637bd78 100644 --- a/nlu/spellbook.py +++ b/nlu/spellbook.py @@ -197,6 +197,8 @@ class Spellbook: 'grammar_correctness': ('t5_base', 'model_anno_obj', {'setTask': '"cola sentence: "'}), 'answer_question': ('t5_base', 'model_anno_obj', {'setTask': '"question: "'}), 'classify.sentiment_t5': ('t5_base', 'model_anno_obj', {'setTask': '"sst2 sentence: "'}), + 'openai.completion': ('openai.completion', 'model_anno_obj'), + 'openai.embeddings': ('openai.embeddings', 'model_anno_obj'), } # multi lang models @@ -2437,8 +2439,10 @@ class Spellbook: 'eml': {'eml.embed.w2v_cc_300d': 'w2v_cc_300d'}, 'en': { 'en.distilbert.zero_shot_classifier': 'distilbert_base_zero_shot_classifier_uncased_mnli', + 'en.deberta.zero_shot_classifier': 'deberta_base_zero_shot_classifier_mnli_anli_v3', 'en.classify_image.convnext.tiny': 'image_classifier_convnext_tiny_224_local', 'en.bert.zero_shot_classifier': 'bert_base_cased_zero_shot_classifier_xnli', + 'en.bart.zero_shot_classifier': 'bart_large_zero_shot_classifier_mnli', 'en.roberta.zero_shot_classifier': 'roberta_base_zero_shot_classifier_nli', 'en.seq2seq.distilbart_xsum_12_6': 'distilbart_xsum_12_6', 'en.seq2seq.bart_large_cnn': 'bart_large_cnn', @@ -4775,6 +4779,9 @@ class Spellbook: 'en.embed_sentence.biobert.pubmed_base_cased': 'sent_biobert_pubmed_base_cased', 'en.embed_sentence.biobert.pubmed_large_cased': 'sent_biobert_pubmed_large_cased', 'en.embed_sentence.biobert.pubmed_pmc_base_cased': 'sent_biobert_pubmed_pmc_base_cased', + 'en.embed_sentence.bge_base': 'bge_base', + 'en.embed_sentence.bge_small': 'bge_small', + 'en.embed_sentence.bge_large': 'bge_large', 'en.embed_sentence.covidbert.large_uncased': 'sent_covidbert_large_uncased', 'en.embed_sentence.distil_roberta.distilled_base': 'sent_distilroberta_base', 'en.embed_sentence.doc2vec': 'doc2vec_gigaword_300', @@ -5714,6 +5721,7 @@ class Spellbook: 'en.ner.distil_roberta.wikiann.distilled_base': 'roberta_ner_distilroberta_base_ner_wikiann', 'en.ner.dl': 'ner_dl', 'en.ner.dl.bert': 'ner_dl_bert', + 'en.ner.dl.protein_glove': 'ner_protein_glove', 'en.ner.farbrbert.base.by_giggio': 'bert_ner_farbrbert_base', 'en.ner.farbrbert.base_75.by_giggio': 'bert_ner_far75brbert_base', 'en.ner.fewnerd': 'nerdl_fewnerd_100d', @@ -10382,6 +10390,7 @@ class Spellbook: 'xx.xh.marian.translate_to.fi': 'opus_mt_fi_xh', 'xx.xh.marian.translate_to.fr': 'opus_mt_fr_xh', 'xx.xh.marian.translate_to.sv': 'opus_mt_sv_xh', + 'xx.xlm_roberta.zero_shot_classifier': 'xlm_roberta_large_zero_shot_classifier_xnli_anli', 'xx.yap.marian.translate_to.en': 'opus_mt_yap_en', 'xx.yap.marian.translate_to.fi': 'opus_mt_fi_yap', 'xx.yap.marian.translate_to.fr': 'opus_mt_fr_yap', @@ -12859,6 +12868,7 @@ class Spellbook: 'cpu_transport_ghg_classifier': 'MPNetEmbeddings', 'cross_all_mpnet_base_v2_finetuned_webnlg2020_metric_average': 'MPNetEmbeddings', 'distilbert_base_zero_shot_classifier_uncased_mnli': 'DistilBertForZeroShotClassification', + 'deberta_base_zero_shot_classifier_mnli_anli_v3': 'DeBertaForZeroShotClassification', 'distilbert_base_zero_shot_classifier_turkish_cased_multinli': 'DistilBertForZeroShotClassification', 'distilbert_base_zero_shot_classifier_turkish_cased_allnli': 'DistilBertForZeroShotClassification', 'distilbert_base_zero_shot_classifier_turkish_cased_snli': 'DistilBertForZeroShotClassification', @@ -12866,6 +12876,7 @@ class Spellbook: 'due_eshop_21': 'MPNetEmbeddings', 'due_eshop_21_multilabel': 'MPNetEmbeddings', 'due_retail_25': 'MPNetEmbeddings', + 'bart_large_zero_shot_classifier_mnli': 'BartForZeroShotClassification', 'bert_base_sequence_classifier_imdb': 'BertForSequenceClassification', 'bert_base_token_classifier_conll03': 'BertForTokenClassification', 'bert_base_token_classifier_few_nerd': 'BertForTokenClassification', @@ -12877,6 +12888,7 @@ class Spellbook: 'bert_base_uncased_contracts': 'BertEmbeddings', 'bert_base_uncased_legal': 'BertEmbeddings', 'roberta_base_zero_shot_classifier_nli': 'RoBertaForZeroShotClassification', + 'xlm_roberta_large_zero_shot_classifier_xnli_anli': 'XlmRoBertaForZeroShotClassification', 'bert_biolink_base': 'BertEmbeddings', 'bert_biolink_large': 'BertEmbeddings', 'bert_biomed_pubmed_uncased': 'BertEmbeddings', @@ -15348,6 +15360,9 @@ class Spellbook: 'bert_wiki_books_squad2': 'BertEmbeddings', 'bert_wiki_books_sst2': 'BertEmbeddings', 'beto_sentiment': 'BertForSequenceClassification', + 'bge_small': 'BGEEmbeddings', + 'bge_base': 'BGEEmbeddings', + 'bge_large': 'BGEEmbeddings', 'binary2image': 'BinaryToImage', 'biobert_clinical_base_cased': 'BertEmbeddings', 'biobert_discharge_base_cased': 'BertEmbeddings', @@ -16713,6 +16728,7 @@ class Spellbook: 'ner_diseases_large': 'MedicalNerModel', 'ner_dl': 'NerDLModel', 'ner_dl_bert': 'NerDLModel', + 'ner_protein_glove': 'NerDLModel', 'ner_dl_bert_base_cased': 'NerDLModel', 'ner_dl_sentence': 'NerDLModel', 'ner_drugprot_clinical': 'MedicalNerModel', @@ -16858,6 +16874,8 @@ class Spellbook: 'onto_small_bert_L4_256': 'NerDLModel', 'onto_small_bert_L4_512': 'NerDLModel', 'onto_small_bert_L8_512': 'NerDLModel', + 'openai.completion': 'OpenAICompletion', + 'openai.embeddings': 'OpenAIEmbeddings', 'opus_mt_aav_en': 'MarianTransformer', 'opus_mt_aed_es': 'MarianTransformer', 'opus_mt_af_de': 'MarianTransformer',
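The Spellbook additions above are easier to read as the two-level lookup they feed: an NLU reference resolves to a Spark NLP model name, which in turn resolves to an annotator class. A simplified sketch with stand-in dicts (the real structures live in nlu/spellbook.py; entries mirror the diff):

# Stand-ins for the Spellbook reference map and the model-name -> class map.
reference_to_model = {
    'en.bart.zero_shot_classifier': 'bart_large_zero_shot_classifier_mnli',
    'en.deberta.zero_shot_classifier': 'deberta_base_zero_shot_classifier_mnli_anli_v3',
    'xx.xlm_roberta.zero_shot_classifier': 'xlm_roberta_large_zero_shot_classifier_xnli_anli',
    'en.embed_sentence.bge_small': 'bge_small',
}
model_to_annotator = {
    'bart_large_zero_shot_classifier_mnli': 'BartForZeroShotClassification',
    'deberta_base_zero_shot_classifier_mnli_anli_v3': 'DeBertaForZeroShotClassification',
    'xlm_roberta_large_zero_shot_classifier_xnli_anli': 'XlmRoBertaForZeroShotClassification',
    'bge_small': 'BGEEmbeddings',
}

def resolve(nlu_reference):
    """Resolve an NLU reference to (model_name, annotator_class)."""
    model = reference_to_model[nlu_reference]
    return model, model_to_annotator[model]

# resolve('en.bart.zero_shot_classifier') ->
# ('bart_large_zero_shot_classifier_mnli', 'BartForZeroShotClassification')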
diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py index 088a14a6..b2f0f22c 100644 --- a/nlu/universe/annotator_class_universe.py +++ b/nlu/universe/annotator_class_universe.py @@ -15,6 +15,7 @@ class AnnoClassRef: JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = { A_N.E5_SENTENCE_EMBEDDINGS: 'E5Embeddings', + A_N.BGE_SENTENCE_EMBEDDINGS: 'BGEEmbeddings', A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS: 'InstructorEmbeddings', A_N.WHISPER_FOR_CTC: 'WhisperForCTC', @@ -101,8 +102,13 @@ class AnnoClassRef: A_N.DISTIL_BERT_EMBEDDINGS: 'DistilBertEmbeddings', A_N.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION: 'DistilBertForSequenceClassification', A_N.DISTIL_BERT_FOR_ZERO_SHOT_CLASSIFICATION: 'DistilBertForZeroShotClassification', + A_N.DEBERTA_FOR_ZERO_SHOT_CLASSIFICATION: 'DeBertaForZeroShotClassification', + A_N.BERT_FOR_SEQUENCE_CLASSIFICATION: 'BertForSequenceClassification', + A_N.XLM_ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION: 'XlmRoBertaForZeroShotClassification', A_N.BERT_FOR_ZERO_SHOT_CLASSIFICATION: 'BertForZeroShotClassification', + A_N.BART_FOR_ZERO_SHOT_CLASSIFICATION: 'BartForZeroShotClassification', A_N.ELMO_EMBEDDINGS: 'ElmoEmbeddings', A_N.LONGFORMER_EMBEDDINGS: 'LongformerEmbeddings', A_N.ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION: 'RoBertaForZeroShotClassification', @@ -125,6 +131,8 @@ class AnnoClassRef: A_N.ALBERT_FOR_SEQUENCE_CLASSIFICATION: 'AlbertForSequenceClassification', A_N.XLNET_FOR_SEQUENCE_CLASSIFICATION: 'XlnetForSequenceClassification', A_N.GPT2: 'GPT2Transformer', + A_N.OPENAI_COMPLETION: 'OpenAICompletion', + A_N.OPENAI_EMBEDDINGS: 'OpenAIEmbeddings', A_N.DEBERTA_WORD_EMBEDDINGS: 'DeBertaEmbeddings', A_N.DEBERTA_FOR_TOKEN_CLASSIFICATION: 'DeBertaForTokenClassification', A_N.CAMEMBERT_EMBEDDINGS: 'CamemBertEmbeddings', diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py index e94e4f4e..908f047f 100644 --- a/nlu/universe/component_universes.py +++ b/nlu/universe/component_universes.py @@ -9,10 +9,16 @@ from nlu.components.classifiers.asr.wav2Vec import Wav2Vec from nlu.components.classifiers.asr_hubert.hubert import Hubert from nlu.components.classifiers.asr_whisper.whisper import Whisper +from nlu.components.classifiers.xlm_roberta_zero_shot_classification.xlm_roberta_zero_shot import XlmRobertaZeroShotClassifier from nlu.components.classifiers.bert_zero_shot_classification.bert_zero_shot import BertZeroShotClassifier +from nlu.components.classifiers.bart_zero_shot_classification.bart_zero_shot import BartZeroShotClassifier from nlu.components.classifiers.classifier_dl.classifier_dl import ClassifierDl from nlu.components.classifiers.distil_bert_zero_shot_classification.distil_bert_zero_shot import \ DistilBertZeroShotClassifier + +from nlu.components.classifiers.deberta_zero_shot.deberta_zero_shot import DeBertaZeroShotClassifier + from nlu.components.classifiers.generic_classifier.generic_classifier import GenericClassifier from nlu.components.classifiers.image_classification_swin.swin import SwinImageClassifier from nlu.components.classifiers.image_classification_vit.convnext_image_classification import ConvNextImageClassifier @@ -39,6 +45,7 @@ from nlu.components.classifiers.seq_xlm_roberta.seq_xlm_roberta import SeqXlmRobertaClassifier from nlu.components.classifiers.seq_xlnet.seq_xlnet import SeqXlnetClassifier from nlu.components.classifiers.span_bert.span_bert import SpanBertClassifier +from
nlu.components.classifiers.span_albert.span_albert import SpanAlbertClassifier from nlu.components.classifiers.span_camembert.span_camembert import SpanCamemBert from nlu.components.classifiers.span_deberta.span_deberta import SpanDeBertaClassifier from nlu.components.classifiers.span_distilbert.span_distilbert import SpanDistilBertClassifier @@ -76,6 +83,7 @@ from nlu.components.embeddings.longformer.longformer import Longformer from nlu.components.embeddings.roberta.roberta import Roberta from nlu.components.embeddings.sentence_e5.E5SentenceEmbedding import E5 +from nlu.components.embeddings.sentence_bge.BGESentenceEmbedding import BGE from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence from nlu.components.embeddings.sentence_roberta.RobertaSentenceEmbedding import RobertaSentence from nlu.components.embeddings.sentence_mpnet.MPNetSentenceEmbedding import MPNetSentence @@ -100,6 +108,8 @@ from nlu.components.sentence_detectors.pragmatic_sentence_detector.sentence_detector import PragmaticSentenceDetector from nlu.components.seq2seqs.bart_transformer.bart_transformer import SparkNLPBartTransformer from nlu.components.seq2seqs.gpt2.gpt2 import GPT2 +from nlu.components.seq2seqs.openai_completion.openai_completion import OpenaiCompletion +from nlu.components.embeddings.openai_embeddings.openai_embeddings import OpenaiEmbeddings from nlu.components.seq2seqs.marian.marian import Marian from nlu.components.seq2seqs.med_summarizer.med_summarizer import MedSummarizer from nlu.components.seq2seqs.med_text_generator.med_text_generator import MedTextGenerator @@ -129,6 +139,7 @@ from nlu.ocr_components.table_extractors.doc_table_extractor.doc2table import Doc2TextTable from nlu.ocr_components.table_extractors.pdf_table_extractor.pdf2table import PDF2TextTable from nlu.ocr_components.table_extractors.ppt_table_extractor.ppt2table import PPT2TextTable +from nlu.ocr_components.visual_classifiers.visual_document_classifier.visual_document_classifier import VisualDocClassifier from nlu.ocr_components.text_recognizers.doc2text.doc2text import Doc2Text from nlu.ocr_components.text_recognizers.img2text.img2text import Img2Text from nlu.ocr_components.text_recognizers.pdf2text.pdf2text import Pdf2Text @@ -140,10 +151,12 @@ from nlu.ocr_components.utils.image_split_regions.image_split_regions import ImageSplitRegions # from nlu.ocr_components.visual_classifiers.visual_doc_classifier.visual_doc_classifier import VisualDocClassifier from nlu.pipe.col_substitution.col_substitution_HC import * -from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols +from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols, \ + substitute_document_classifier_text_cols from nlu.pipe.col_substitution.col_substitution_OS import * from nlu.pipe.extractors.extractor_configs_HC import * -from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config +from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config, \ + default_visual_classifier_config from nlu.pipe.extractors.extractor_configs_OS import * from nlu.pipe.nlu_component import NluComponent from nlu.universe.annotator_class_universe import AnnoClassRef @@ -2418,6 +2431,49 @@ class ComponentUniverse: has_storage_ref=True, is_storage_ref_producer=True, ), + A.BGE_SENTENCE_EMBEDDINGS: partial(NluComponent, + name=A.BGE_SENTENCE_EMBEDDINGS, + type=T.DOCUMENT_EMBEDDING, + 
get_default_model=BGE.get_default_model, + get_pretrained_model=BGE.get_pretrained_model, + pdf_extractor_methods={'default': default_sentence_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sent_embed_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, + node=NLP_FEATURE_NODES.nodes[A.BGE_SENTENCE_EMBEDDINGS], + description='Sentence-level embeddings using BGE (BAAI General Embedding), a text embedding model that generates embeddings tailored to tasks such as classification, retrieval, clustering and text evaluation.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.BGE_SENTENCE_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.BGE_SENTENCE_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), + + A.OPENAI_EMBEDDINGS: partial(NluComponent, + name=A.OPENAI_EMBEDDINGS, + type=T.DOCUMENT_EMBEDDING, + get_default_model=OpenaiEmbeddings.get_default_model, + get_pretrained_model=OpenaiEmbeddings.get_pretrained_model, + pdf_extractor_methods={'default': default_sentence_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sent_embed_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, + node=NLP_FEATURE_NODES.nodes[A.OPENAI_EMBEDDINGS], + description='Sentence embeddings computed via the OpenAI embeddings API.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.OPENAI_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.OPENAI_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), + A.BERT_FOR_TOKEN_CLASSIFICATION: partial(NluComponent, name=A.BERT_FOR_TOKEN_CLASSIFICATION, type=T.TRANSFORMER_TOKEN_CLASSIFIER, @@ -3027,6 +3083,36 @@ class ComponentUniverse: jsl_anno_py_class=ACR.JSL_anno2_py_class[ A.DISTIL_BERT_FOR_ZERO_SHOT_CLASSIFICATION], ), + + A.DEBERTA_FOR_ZERO_SHOT_CLASSIFICATION: partial(NluComponent, + name=A.DEBERTA_FOR_ZERO_SHOT_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=DeBertaZeroShotClassifier.get_default_model, + get_pretrained_model=DeBertaZeroShotClassifier.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_seq_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[ + A.DEBERTA_FOR_ZERO_SHOT_CLASSIFICATION], + description='DeBerta Zero Shot Classifier.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.DEBERTA_FOR_ZERO_SHOT_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.DEBERTA_FOR_ZERO_SHOT_CLASSIFICATION], + ), + + A.BERT_FOR_ZERO_SHOT_CLASSIFICATION: partial(NluComponent, name=A.BERT_FOR_ZERO_SHOT_CLASSIFICATION, type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, @@ -3047,7 +3133,26 @@ class ComponentUniverse: jsl_anno_py_class=ACR.JSL_anno2_py_class[ A.BERT_FOR_ZERO_SHOT_CLASSIFICATION], ), - + A.BART_FOR_ZERO_SHOT_CLASSIFICATION: partial(NluComponent, + name=A.BART_FOR_ZERO_SHOT_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=BartZeroShotClassifier.get_default_model, +
get_pretrained_model=BartZeroShotClassifier.get_pretrained_model, + pdf_extractor_methods={'default': default_seq_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[ + A.BART_FOR_ZERO_SHOT_CLASSIFICATION], + description='Bart Zero Shot Classifier.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.BART_FOR_ZERO_SHOT_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.BART_FOR_ZERO_SHOT_CLASSIFICATION], + ), A.GPT2: partial(NluComponent, name=A.GPT2, type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, @@ -3066,6 +3171,24 @@ class ComponentUniverse: jsl_anno_py_class=ACR.JSL_anno2_py_class[A.GPT2], ), + A.OPENAI_COMPLETION: partial(NluComponent, + name=A.OPENAI_COMPLETION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=OpenaiCompletion.get_default_model, + get_pretrained_model=OpenaiCompletion.get_pretrained_model, + pdf_extractor_methods={'default': default_gpt2_config, 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_gpt2_cols, # TODO: test + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.OPENAI_COMPLETION], + description='Text generation via the OpenAI completion API.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.OPENAI_COMPLETION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.OPENAI_COMPLETION], + ), + A.WORD_2_VEC: partial(NluComponent, # TODO name=A.WORD_2_VEC, type=T.TOKEN_EMBEDDING, @@ -3172,6 +3295,25 @@ class ComponentUniverse: computation_context=ComputeContexts.spark, output_context=ComputeContexts.spark, ), + A.ALBERT_FOR_QUESTION_ANSWERING: partial(NluComponent, + name=A.ALBERT_FOR_QUESTION_ANSWERING, + jsl_anno_class_id=A.ALBERT_FOR_QUESTION_ANSWERING, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ALBERT_FOR_QUESTION_ANSWERING], + node=NLP_FEATURE_NODES.nodes[A.ALBERT_FOR_QUESTION_ANSWERING], + get_default_model=SpanAlbertClassifier.get_default_model, + get_pretrained_model=SpanAlbertClassifier.get_pretrained_model, + type=T.QUESTION_SPAN_CLASSIFIER, + pdf_extractor_methods={ + 'default': default_span_classifier_config, + 'default_full': default_full_span_classifier_config, }, + pdf_col_name_substitutor=substitute_span_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + description='Albert-based span classifier for extractive question answering.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + ), A.DE_BERTA_FOR_QUESTION_ANSWERING: partial(NluComponent, name=A.DE_BERTA_FOR_QUESTION_ANSWERING, @@ -4194,31 +4336,6 @@ class ComponentUniverse: applicable_file_types=['DOCX', 'DOC'] ), - # O_A.VISUAL_DOCUMENT_CLASSIFIER: partial(NluComponent, - # name=O_A.VISUAL_DOCUMENT_CLASSIFIER, - # type=T.PDF_BUILDER, - # get_default_model=VisualDocClassifier.get_default_model, - # get_pretrained_model=VisualDocClassifier.get_pretrained_model, - # - # pdf_extractor_methods={'default': default_visual_classifier_config}, - # # TODO EXtractor - # pdf_col_name_substitutor=substitute_recognized_text_cols, - # # TODO substitor - # output_level=L.DOCUMENT, - # node=OCR_FEATURE_NODES.nodes[O_A.VISUAL_DOCUMENT_CLASSIFIER], - # description='Convert text to PDF file', - # provider=ComponentBackends.ocr,
- # license=Licenses.ocr, - # computation_context=ComputeContexts.spark, - # output_context=ComputeContexts.spark, - # jsl_anno_class_id=O_A.VISUAL_DOCUMENT_CLASSIFIER, - # jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[ - # O_A.VISUAL_DOCUMENT_CLASSIFIER], - # applicable_file_types=['JPG', 'JPEG'] - # ), - # - - O_A.IMAGE_TABLE_CELL_DETECTOR: partial(NluComponent, name=O_A.IMAGE_TABLE_CELL_DETECTOR, type=T.TEXT_RECOGNIZER, @@ -4297,6 +4414,27 @@ class ComponentUniverse: ), + O_A.VISUAL_DOCUMENT_CLASSIFIER: partial(NluComponent, + name=O_A.VISUAL_DOCUMENT_CLASSIFIER, + type=T.PDF_BUILDER, + get_default_model=VisualDocClassifier.get_default_model, + pdf_extractor_methods={'default': default_visual_classifier_config}, + # TODO: extractor config + pdf_col_name_substitutor=substitute_document_classifier_text_cols, + # TODO: substitutor + output_level=L.DOCUMENT, + node=OCR_FEATURE_NODES.nodes[O_A.VISUAL_DOCUMENT_CLASSIFIER], + description='Classify visual documents such as scanned images into document types.', + provider=ComponentBackends.ocr, + license=Licenses.ocr, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=O_A.VISUAL_DOCUMENT_CLASSIFIER, + jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[ + O_A.VISUAL_DOCUMENT_CLASSIFIER], + applicable_file_types=['JPG', 'JPEG'] + ), + O_A.IMAGE2HOCR: partial(NluComponent, name=O_A.IMAGE2HOCR, type=T.OCR_UTIL, diff --git a/nlu/universe/feature_node_ids.py b/nlu/universe/feature_node_ids.py index c7fbf027..655f08ca 100644 --- a/nlu/universe/feature_node_ids.py +++ b/nlu/universe/feature_node_ids.py @@ -91,6 +91,7 @@ class NLP_NODE_IDS: CAMEMBERT_FOR_TOKEN_CLASSIFICATION = JslAnnoId('camenbert_for_token_classification') CAMEMBERT_FOR_SEQUENCE_CLASSIFICATION = JslAnnoId('camenbert_for_sequence_classification') E5_SENTENCE_EMBEDDINGS = JslAnnoId('e5_sentence_embeddings') + BGE_SENTENCE_EMBEDDINGS = JslAnnoId('bge_sentence_embeddings') BERT_SENTENCE_EMBEDDINGS = JslAnnoId('bert_sentence_embeddings') DISTIL_BERT_EMBEDDINGS = JslAnnoId('distil_bert_embeddings') DISTIL_BERT_FOR_TOKEN_CLASSIFICATION = JslAnnoId('distil_bert_for_token_classification') @@ -117,7 +118,12 @@ class NLP_NODE_IDS: DISTIL_BERT_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('distil_bert_zero_shot') + XLM_ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('xlm_roberta_zero_shot') + + DEBERTA_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('deberta_zero_shot') + BERT_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('bert_zero_shot') + BART_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('bart_zero_shot') ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('roberta_zero_shot') UNIVERSAL_SENTENCE_ENCODER = JslAnnoId('universal_sentence_encoder') XLM_ROBERTA_EMBEDDINGS = JslAnnoId('xlm_roberta_embeddings') @@ -131,6 +137,8 @@ class NLP_NODE_IDS: ALBERT_FOR_SEQUENCE_CLASSIFICATION = JslAnnoId('albert_for_sequence_classification') XLNET_FOR_SEQUENCE_CLASSIFICATION = JslAnnoId('xlnet_for_sequence_classification') GPT2 = JslAnnoId('gpt2') + OPENAI_COMPLETION = JslAnnoId('openai_completion') + OPENAI_EMBEDDINGS = JslAnnoId('openai_embeddings') WORD_2_VEC = JslAnnoId('word_2_vec') DEBERTA_WORD_EMBEDDINGS = JslAnnoId('deberta') DEBERTA_FOR_SEQUENCE_CLASSIFICATION = JslAnnoId('deberta_for_sequence_classification') diff --git a/nlu/universe/feature_node_universes.py b/nlu/universe/feature_node_universes.py index 88c54a28..2ff9830f 100644 --- a/nlu/universe/feature_node_universes.py +++ b/nlu/universe/feature_node_universes.py @@ -76,6 +76,7 @@ class NLP_FEATURE_NODES: # or Mode Node?
A.INSTRUCTOR_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.INSTRUCTOR_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]), A.E5_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.E5_SENTENCE_EMBEDDINGS, [F.DOCUMENT],[F.SENTENCE_EMBEDDINGS]), + A.BGE_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.BGE_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]), A.MPNET_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.MPNET_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]), A.PARTIALLY_IMPLEMENTED: NlpFeatureNode(A.PARTIALLY_IMPLEMENTED, [F.UNKOWN], [F.UNKOWN]), @@ -237,8 +238,12 @@ class NLP_FEATURE_NODES: # or Mode Node? A.CAMEMBERT_FOR_SEQUENCE_CLASSIFICATION: NlpFeatureNode(A.CAMEMBERT_FOR_SEQUENCE_CLASSIFICATION, [F.DOCUMENT, F.TOKEN], [F.SEQUENCE_CLASSIFICATION]), + A.XLM_ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION: NlpFeatureNode(A.XLM_ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION, [F.DOCUMENT, F.TOKEN], + [F.SEQUENCE_CLASSIFICATION]), A.BERT_FOR_ZERO_SHOT_CLASSIFICATION: NlpFeatureNode(A.BERT_FOR_ZERO_SHOT_CLASSIFICATION, [F.DOCUMENT, F.TOKEN], [F.SEQUENCE_CLASSIFICATION]), + A.BART_FOR_ZERO_SHOT_CLASSIFICATION: NlpFeatureNode(A.BART_FOR_ZERO_SHOT_CLASSIFICATION, [F.DOCUMENT, F.TOKEN], + [F.SEQUENCE_CLASSIFICATION]), A.DEBERTA_FOR_SEQUENCE_CLASSIFICATION: NlpFeatureNode(A.BERT_FOR_SEQUENCE_CLASSIFICATION, [F.DOCUMENT, F.TOKEN], [F.SEQUENCE_CLASSIFICATION]), @@ -250,6 +255,10 @@ class NLP_FEATURE_NODES: # or Mode Node? [F.DOCUMENT, F.TOKEN], [F.SEQUENCE_CLASSIFICATION]), + A.DEBERTA_FOR_ZERO_SHOT_CLASSIFICATION: NlpFeatureNode(A.DEBERTA_FOR_ZERO_SHOT_CLASSIFICATION, + [F.DOCUMENT, F.TOKEN], + [F.SEQUENCE_CLASSIFICATION]), + A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION: NlpFeatureNode(A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION, [F.DOCUMENT, F.TOKEN], [F.SEQUENCE_CLASSIFICATION]), @@ -269,6 +278,8 @@ class NLP_FEATURE_NODES: # or Mode Node? [F.DOCUMENT, F.TOKEN], [F.SEQUENCE_CLASSIFICATION]), A.GPT2: NlpFeatureNode(A.GPT2, [F.DOCUMENT], [F.DOCUMENT_GENERATED]), + A.OPENAI_COMPLETION: NlpFeatureNode(A.OPENAI_COMPLETION, [F.DOCUMENT], [F.DOCUMENT_GENERATED]), + A.OPENAI_EMBEDDINGS: NlpFeatureNode(A.OPENAI_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]), A.WORD_2_VEC: NlpFeatureNode(A.WORD_2_VEC, [F.TOKEN], [F.WORD_EMBEDDINGS]), A.BERT_SENTENCE_CHUNK_EMBEDDINGS: NlpFeatureNode(A.BERT_SENTENCE_CHUNK_EMBEDDINGS, [F.DOCUMENT], [F.NAMED_ENTITY_CONVERTED]), @@ -291,7 +302,7 @@ class OCR_FEATURE_NODES: F = OCR_FEATURES nodes = { A.VISUAL_DOCUMENT_CLASSIFIER: OcrFeatureNode(A.VISUAL_DOCUMENT_CLASSIFIER, [F.HOCR], - [F.VISUAL_CLASSIFIER_PREDICTION, F.VISUAL_CLASSIFIER_CONFIDENCE]), + [F.VISUAL_CLASSIFIER_PREDICTION, F.VISUAL_CLASSIFIER_CONFIDENCE, F.FILE_PATH]), A.IMAGE2HOCR: OcrFeatureNode(A.IMAGE2HOCR, [F.OCR_IMAGE], [F.HOCR]),
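A feature node declares what a component consumes and what it produces, which is how NLU auto-completes pipelines around a new annotator. A minimal sketch of what the entries added above encode (simplified stand-in types, not the real NlpFeatureNode class):

from dataclasses import dataclass
from typing import List

@dataclass
class FeatureNodeSketch:
    node_id: str
    ins: List[str]   # features that must already exist in the pipeline
    outs: List[str]  # features this component contributes

# Mirrors the additions above: BGE and the OpenAI annotators consume documents.
bge = FeatureNodeSketch('bge_sentence_embeddings', ['document'], ['sentence_embeddings'])
openai_completion = FeatureNodeSketch('openai_completion', ['document'], ['document_generated'])
openai_embeddings = FeatureNodeSketch('openai_embeddings', ['document'], ['sentence_embeddings'])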
diff --git a/setup.py b/setup.py index 743cff6a..e428c650 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,4 @@ -"""A setuptools based setup module. - -See: -https://packaging.python.org/en/latest/distributing.html -https://github.com/pypa/sampleproject -""" +import nlu from codecs import open from os import path @@ -27,7 +22,7 @@ name='nlu', - version='5.1.4', + version=nlu.version(), description='John Snow Labs NLU provides state-of-the-art algorithms for NLP&NLU with 20,000+ pretrained models in 200+ languages. It enables swift and simple development and research with its powerful Pythonic and Keras-inspired API. It is powered by the powerful John Snow Labs Spark NLP library.', @@ -58,3 +53,4 @@ packages=find_packages(exclude=['test*', 'tmp*']), include_package_data=True ) + diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/bart_zero_shot_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/bart_zero_shot_tests.py new file mode 100644 index 00000000..bcf08a5a --- /dev/null +++ b/tests/nlu_core_tests/component_tests/classifier_tests/bart_zero_shot_tests.py @@ -0,0 +1,18 @@ +import unittest + +from nlu import * + + +class TestBartZeroShotClassifier(unittest.TestCase): + def test_bart_zero_shot_classifier(self): + pipe = nlu.load("en.bart.zero_shot_classifier", verbose=True) + df = pipe.predict( + ["I loved this movie when I was a child."], + output_level="sentence" + ) + for c in df.columns: + print(df[c]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/deberta_zero_shot_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/deberta_zero_shot_tests.py new file mode 100644 index 00000000..1863df6f --- /dev/null +++ b/tests/nlu_core_tests/component_tests/classifier_tests/deberta_zero_shot_tests.py @@ -0,0 +1,15 @@ +import unittest + +from nlu import * + + +class TestDeBertaZeroShotClassifier(unittest.TestCase): + def test_deberta_zero_shot_classifier(self): + pipe = nlu.load("en.deberta.zero_shot_classifier") + df = pipe.predict(["I loved this movie when I was a child."]) + for c in df.columns: + print(df[c]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/xlm_roberta_zero_shot_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/xlm_roberta_zero_shot_tests.py new file mode 100644 index 00000000..aea2394e --- /dev/null +++ b/tests/nlu_core_tests/component_tests/classifier_tests/xlm_roberta_zero_shot_tests.py @@ -0,0 +1,18 @@ +import unittest + +from nlu import * + + +class TestXlmRobertaZeroShotClassifier(unittest.TestCase): + def test_xlmroberta_zero_shot_classifier(self): + pipe = nlu.load('xx.xlm_roberta.zero_shot_classifier', verbose=True) + df = pipe.predict( + ["I loved this movie when I was a child."], + output_level="sentence" + ) + for c in df.columns: + print(df[c]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/openai_embeddings_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/openai_embeddings_tests.py new file mode 100644 index 00000000..efa2cefc --- /dev/null +++ b/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/openai_embeddings_tests.py @@ -0,0 +1,19 @@ +import unittest +from nlu import * +import os + + +class TestOpenAIEmbeddings(unittest.TestCase): + def test_openAI_embeds(self): + + pipe = nlu.load("openai.embeddings") + + pipe['openai_embeddings'].setModel('text-embedding-ada-002') + + res = pipe.predict(["The food was delicious and the waiter...", "canine companions say"], output_level='document') + + for c in res: + print(res[c]) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_bge_tests.py b/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_bge_tests.py new file mode 100644 index 00000000..025190e7 --- /dev/null +++ b/tests/nlu_core_tests/component_tests/embed_tests/sentence_embeddings/sentence_bge_tests.py @@ -0,0 +1,18 @@ +import unittest + +from nlu
import * + + +class TestBGESentenceEmbeddings(unittest.TestCase): + def test_bge_embeds(self): + pipe = nlu.load("en.embed_sentence.bge_small", verbose=True) + res = pipe.predict( + "query: how much protein should a female eat", + output_level="document" + ) + for c in res: + print(res[c]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/nlu_core_tests/component_tests/seq2seq/openai_completion_test.py b/tests/nlu_core_tests/component_tests/seq2seq/openai_completion_test.py new file mode 100644 index 00000000..0537ba7a --- /dev/null +++ b/tests/nlu_core_tests/component_tests/seq2seq/openai_completion_test.py @@ -0,0 +1,24 @@ +import unittest + +from nlu import * + +import os + + +class TestOpenAICompletion(unittest.TestCase): + def test_openai_completion(self): + + pipe = nlu.load("openai.completion", apple_silicon=True) + + pipe['openai_completion'].setModel('text-davinci-003') + pipe['openai_completion'].setMaxTokens(50) + + res = pipe.predict( + ["Generate a restaurant review.", "Write a review for a local eatery.", "Create a JSON with a review"], + output_level='document') + + for c in res: + print(res[c]) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/nlu_ocr_tests/cv_test.png b/tests/nlu_ocr_tests/cv_test.png new file mode 100644 index 00000000..331e7261 Binary files /dev/null and b/tests/nlu_ocr_tests/cv_test.png differ diff --git a/tests/nlu_ocr_tests/letter.jpg b/tests/nlu_ocr_tests/letter.jpg new file mode 100644 index 00000000..2fb6cdfb Binary files /dev/null and b/tests/nlu_ocr_tests/letter.jpg differ diff --git a/tests/nlu_ocr_tests/ocr_pdf_builder_tests.py b/tests/nlu_ocr_tests/ocr_pdf_builder_tests.py index d0f84cae..9e5db019 100644 --- a/tests/nlu_ocr_tests/ocr_pdf_builder_tests.py +++ b/tests/nlu_ocr_tests/ocr_pdf_builder_tests.py @@ -1,37 +1,37 @@ -# import tests.secrets as sct -# import unittest -# import nlu -# -# SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE -# AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID -# AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY -# JSL_SECRET = sct.JSL_SECRET -# OCR_SECRET = sct.OCR_SECRET -# OCR_LICENSE = sct.OCR_LICENSE -# # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) -# -# class OcrTest(unittest.TestCase): -# -# def test_text_to_pdf(self): -# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) -# # text that we generate PDF to has to come from an image struct! -# # We need convert text to img struct! 
-# -# p = nlu.load('ppt2table',verbose=True) -# dfs = p.predict([f1,f2]) -# for df in dfs : -# print(df) -# -# def test_DOC_table_extraction(self): -# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) -# f1 = '/home/ckl/Documents/freelance/jsl/nlu/nlu4realgit2/tests/datasets/ocr/table_DOCX/doc2.docx' -# p = nlu.load('doc2table',verbose=True) -# dfs = p.predict([f1]) -# for df in dfs : -# print(df) -# -# -# -# if __name__ == '__main__': -# unittest.main() -# +import tests.secrets as sct +import unittest +import nlu + +SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE +AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID +AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY +JSL_SECRET = sct.JSL_SECRET +OCR_SECRET = sct.OCR_SECRET +OCR_LICENSE = sct.OCR_LICENSE +# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) + +class OcrTest(unittest.TestCase): + + def test_text_to_pdf(self): + nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) + # The text we generate a PDF from has to come from an image struct! + # We need to convert the text to an image struct first! + + f1 = 'tests/datasets/ocr/table_PPT/54111.ppt' + f2 = 'tests/datasets/ocr/table_PPT/mytable.ppt' + p = nlu.load('ppt2table', verbose=True) + dfs = p.predict([f1, f2]) + for df in dfs: + print(df) + + def test_DOC_table_extraction(self): + nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) + f1 = '/home/ckl/Documents/freelance/jsl/nlu/nlu4realgit2/tests/datasets/ocr/table_DOCX/doc2.docx' + p = nlu.load('doc2table', verbose=True) + dfs = p.predict([f1]) + for df in dfs: + print(df) + + + +if __name__ == '__main__': + unittest.main() + diff --git a/tests/nlu_ocr_tests/ocr_table_extraction_tests.py b/tests/nlu_ocr_tests/ocr_table_extraction_tests.py index c30277c5..4bd86178 100644 --- a/tests/nlu_ocr_tests/ocr_table_extraction_tests.py +++ b/tests/nlu_ocr_tests/ocr_table_extraction_tests.py @@ -1,4 +1,7 @@ -import tests.secrets as sct +import os +import sys + +sys.path.append(os.getcwd()) +import tests.secrets as sct import unittest import nlu nlu.auth(sct.SPARK_NLP_LICENSE,sct.AWS_ACCESS_KEY_ID,sct.AWS_SECRET_ACCESS_KEY,sct.JSL_SECRET, sct.OCR_LICENSE, sct.OCR_SECRET) @@ -34,10 +37,10 @@ def test_PPT_table_extraction(self): f1 = 'tests/datasets/ocr/table_PPT/54111.ppt' f2 ='tests/datasets/ocr/table_PPT/mytable.ppt' p = nlu.load('ppt2table',verbose=True) - dfs = p.predict([f1,f2]) + dfs = p.predict([f1]) for df in dfs : print(df) - + def test_DOC_table_extraction(self): f1 = 'tests/datasets/ocr/docx_with_table/doc2.docx' p = nlu.load('doc2table',verbose=True) diff --git a/tests/nlu_ocr_tests/ocr_visual_doc_classifier_tests.py b/tests/nlu_ocr_tests/ocr_visual_doc_classifier_tests.py index 889bd030..ec25389a 100644 --- a/tests/nlu_ocr_tests/ocr_visual_doc_classifier_tests.py +++ b/tests/nlu_ocr_tests/ocr_visual_doc_classifier_tests.py @@ -1,26 +1,34 @@ -# import tests.secrets as sct -# import unittest -# import nlu -# -# SPARK_NLP_LICENSE = sct.SPARK_NLP_LICENSE -# AWS_ACCESS_KEY_ID = sct.AWS_ACCESS_KEY_ID -# AWS_SECRET_ACCESS_KEY = sct.AWS_SECRET_ACCESS_KEY -# JSL_SECRET = sct.JSL_SECRET -# OCR_SECRET = sct.OCR_SECRET -# OCR_LICENSE = sct.OCR_LICENSE -# # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) -# -# class OcrTest(unittest.TestCase): -# -# def test_classify_document(self): -# nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) -# # text that we generate PDF to has to come from an image struct!
-# # We need convert text to img struct! -# p = nlu.load('en.classify_image.tabacco',verbose=True) -# res = p.predict('/home/ckl/Documents/freelance/jsl/nlu/nlu4realgit2/tests/datasets/ocr/classification_images/letter.jpg') -# for r in res.columns: -# print(r[res]) -# -# if __name__ == '__main__': -# unittest.main() -# + +import os +import sys + +sys.path.append(os.getcwd()) +import unittest +import nlu + +os.environ["PYTHONPATH"] = "F:/Work/repos/nlu" +os.environ['PYSPARK_PYTHON'] = sys.executable +os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable +from johnsnowlabs import nlp, visual + +# nlp.install(json_license_path='license.json',visual=True) +nlp.start(visual=True) + +class OcrTest(unittest.TestCase): + + def test_classify_document(self): + # nlu.auth(SPARK_NLP_LICENSE,AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY,JSL_SECRET, OCR_LICENSE, OCR_SECRET) + p = nlu.load('en.classify_image.tabacco', verbose=True) + res = p.predict('cv_test.png') + for i, j in res.iterrows(): + print(i, j) + print(res) + +if __name__ == '__main__': + unittest.main() +
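Taken together, the new references are exercised like any other NLU spell. A usage sketch for the zero-shot additions with a runtime-chosen label set (the pipe key 'xlm_roberta_zero_shot' and the setCandidateLabels call follow the Spark NLP zero-shot annotator API and are assumptions, not taken from this patch):

import nlu

# Multilingual zero-shot classification: the label set is picked at
# predict time rather than at training time.
pipe = nlu.load('xx.xlm_roberta.zero_shot_classifier')
pipe['xlm_roberta_zero_shot'].setCandidateLabels(['urgent', 'mobile', 'travel'])
df = pipe.predict(['Ich habe ein Problem mit meiner Hotelbuchung!'],
                  output_level='document')
print(df)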